vellum 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -2
- package/bun.lock +5 -2
- package/package.json +4 -2
- package/scripts/capture-x-graphql.ts +562 -0
- package/scripts/ipc/check-swift-decoder-drift.ts +2 -1
- package/scripts/test.sh +5 -0
- package/src/__tests__/__snapshots__/ipc-snapshot.test.ts.snap +161 -34
- package/src/__tests__/account-registry.test.ts +2 -1
- package/src/__tests__/agent-heartbeat-service.test.ts +250 -0
- package/src/__tests__/app-bundler.test.ts +12 -33
- package/src/__tests__/asset-materialize-tool.test.ts +16 -15
- package/src/__tests__/asset-search-tool.test.ts +23 -22
- package/src/__tests__/attachments-store.test.ts +56 -127
- package/src/__tests__/browser-skill-baseline-tool-payload.test.ts +5 -4
- package/src/__tests__/browser-skill-endstate.test.ts +5 -8
- package/src/__tests__/call-bridge.test.ts +385 -0
- package/src/__tests__/call-constants.test.ts +40 -0
- package/src/__tests__/call-orchestrator.test.ts +454 -0
- package/src/__tests__/call-recovery.test.ts +518 -0
- package/src/__tests__/call-routes-http.test.ts +459 -0
- package/src/__tests__/call-state-machine.test.ts +143 -0
- package/src/__tests__/call-state.test.ts +133 -0
- package/src/__tests__/call-store.test.ts +691 -0
- package/src/__tests__/cli-discover.test.ts +1 -1
- package/src/__tests__/commit-message-enrichment-service.test.ts +550 -0
- package/src/__tests__/compaction.benchmark.test.ts +176 -0
- package/src/__tests__/computer-use-tools.test.ts +250 -0
- package/src/__tests__/config-schema.test.ts +348 -3
- package/src/__tests__/conflict-store.test.ts +2 -1
- package/src/__tests__/contacts-tools.test.ts +331 -0
- package/src/__tests__/conversation-store.test.ts +30 -32
- package/src/__tests__/credential-security-invariants.test.ts +4 -0
- package/src/__tests__/date-context.test.ts +373 -0
- package/src/__tests__/db-schedule-syntax-migration.test.ts +129 -0
- package/src/__tests__/doordash-session.test.ts +9 -0
- package/src/__tests__/fixtures/media-reuse-fixtures.ts +3 -3
- package/src/__tests__/followup-tools.test.ts +303 -0
- package/src/__tests__/handlers-twitter-config.test.ts +718 -0
- package/src/__tests__/intent-routing.test.ts +64 -57
- package/src/__tests__/ipc-roundtrip.benchmark.test.ts +237 -0
- package/src/__tests__/ipc-snapshot.test.ts +96 -28
- package/src/__tests__/llm-usage-store.test.ts +3 -8
- package/src/__tests__/media-generate-image.test.ts +1 -1
- package/src/__tests__/media-reuse-story.e2e.test.ts +7 -7
- package/src/__tests__/memory-retrieval.benchmark.test.ts +430 -0
- package/src/__tests__/parallel-tool.benchmark.test.ts +294 -0
- package/src/__tests__/playbook-tools.test.ts +342 -0
- package/src/__tests__/profile-compiler.test.ts +2 -1
- package/src/__tests__/provider-streaming.benchmark.test.ts +773 -0
- package/src/__tests__/recurrence-engine-rruleset.test.ts +78 -0
- package/src/__tests__/recurrence-engine.test.ts +69 -0
- package/src/__tests__/recurrence-types.test.ts +71 -0
- package/src/__tests__/registry.test.ts +17 -10
- package/src/__tests__/relay-server.test.ts +633 -0
- package/src/__tests__/reminder-store.test.ts +6 -3
- package/src/__tests__/reminder.test.ts +43 -77
- package/src/__tests__/run-orchestrator-assistant-events.test.ts +222 -0
- package/src/__tests__/run-orchestrator.test.ts +7 -7
- package/src/__tests__/runtime-attachment-metadata.test.ts +19 -20
- package/src/__tests__/runtime-runs-http.test.ts +5 -23
- package/src/__tests__/runtime-runs.test.ts +11 -11
- package/src/__tests__/schedule-store.test.ts +482 -0
- package/src/__tests__/schedule-tools.test.ts +700 -0
- package/src/__tests__/scheduler-recurrence.test.ts +329 -0
- package/src/__tests__/server-history-render.test.ts +14 -13
- package/src/__tests__/session-error.test.ts +28 -0
- package/src/__tests__/session-init.benchmark.test.ts +462 -0
- package/src/__tests__/session-queue.test.ts +89 -16
- package/src/__tests__/session-runtime-assembly.test.ts +161 -0
- package/src/__tests__/session-surfaces-task-progress.test.ts +104 -0
- package/src/__tests__/signup-e2e.test.ts +2 -1
- package/src/__tests__/skill-projection.benchmark.test.ts +328 -0
- package/src/__tests__/skill-script-runner.test.ts +159 -0
- package/src/__tests__/speaker-identification.test.ts +52 -0
- package/src/__tests__/subagent-manager-notify.test.ts +42 -10
- package/src/__tests__/subagent-tools.test.ts +141 -41
- package/src/__tests__/task-compiler.test.ts +2 -1
- package/src/__tests__/task-runner.test.ts +2 -1
- package/src/__tests__/task-scheduler.test.ts +2 -1
- package/src/__tests__/task-tools.test.ts +49 -56
- package/src/__tests__/tool-audit-listener.test.ts +1 -0
- package/src/__tests__/tool-domain-event-publisher.test.ts +2 -0
- package/src/__tests__/tool-execution-pipeline.benchmark.test.ts +500 -0
- package/src/__tests__/tool-executor.test.ts +13 -17
- package/src/__tests__/turn-commit.test.ts +273 -2
- package/src/__tests__/twilio-provider.test.ts +143 -0
- package/src/__tests__/twilio-routes.test.ts +789 -0
- package/src/__tests__/twitter-auth-handler.test.ts +581 -0
- package/src/__tests__/view-image-tool.test.ts +217 -0
- package/src/__tests__/workspace-git-service.test.ts +403 -0
- package/src/__tests__/workspace-heartbeat-service.test.ts +141 -2
- package/src/agent-heartbeat/agent-heartbeat-service.ts +155 -0
- package/src/bundler/app-bundler.ts +35 -14
- package/src/calls/call-bridge.ts +95 -0
- package/src/calls/call-constants.ts +48 -0
- package/src/calls/call-domain.ts +276 -0
- package/src/calls/call-orchestrator.ts +390 -0
- package/src/calls/call-recovery.ts +207 -0
- package/src/calls/call-state-machine.ts +68 -0
- package/src/calls/call-state.ts +64 -0
- package/src/calls/call-store.ts +416 -0
- package/src/calls/relay-server.ts +335 -0
- package/src/calls/speaker-identification.ts +213 -0
- package/src/calls/twilio-config.ts +34 -0
- package/src/calls/twilio-provider.ts +173 -0
- package/src/calls/twilio-routes.ts +250 -0
- package/src/calls/types.ts +37 -0
- package/src/calls/voice-provider.ts +14 -0
- package/src/cli/config-commands.ts +334 -0
- package/src/cli/core-commands.ts +776 -0
- package/src/cli/doordash.ts +256 -25
- package/src/cli/ipc-client.ts +82 -0
- package/src/cli/map.ts +246 -0
- package/src/cli/twitter.ts +575 -0
- package/src/cli.ts +7 -5
- package/src/commands/__tests__/cc-command-registry.test.ts +319 -0
- package/src/commands/cc-command-registry.ts +209 -0
- package/src/config/bundled-skills/contacts/SKILL.md +39 -0
- package/src/config/bundled-skills/contacts/TOOLS.json +122 -0
- package/src/config/bundled-skills/contacts/tools/contact-merge.ts +9 -0
- package/src/config/bundled-skills/contacts/tools/contact-search.ts +9 -0
- package/src/config/bundled-skills/contacts/tools/contact-upsert.ts +9 -0
- package/src/config/bundled-skills/document/SKILL.md +18 -0
- package/src/config/bundled-skills/document/TOOLS.json +53 -0
- package/src/config/bundled-skills/document/tools/document-create.ts +9 -0
- package/src/config/bundled-skills/document/tools/document-update.ts +9 -0
- package/src/config/bundled-skills/doordash/SKILL.md +163 -0
- package/src/config/bundled-skills/followups/SKILL.md +32 -0
- package/src/config/bundled-skills/followups/TOOLS.json +100 -0
- package/src/config/bundled-skills/followups/tools/followup-create.ts +9 -0
- package/src/config/bundled-skills/followups/tools/followup-list.ts +9 -0
- package/src/config/bundled-skills/followups/tools/followup-resolve.ts +9 -0
- package/src/config/bundled-skills/image-studio/TOOLS.json +2 -2
- package/src/config/bundled-skills/image-studio/tools/media-generate-image.ts +2 -24
- package/src/config/bundled-skills/messaging/tools/messaging-analyze-style.ts +2 -1
- package/src/config/bundled-skills/playbooks/SKILL.md +31 -0
- package/src/config/bundled-skills/playbooks/TOOLS.json +126 -0
- package/src/config/bundled-skills/playbooks/tools/playbook-create.ts +9 -0
- package/src/config/bundled-skills/playbooks/tools/playbook-delete.ts +9 -0
- package/src/config/bundled-skills/playbooks/tools/playbook-list.ts +9 -0
- package/src/config/bundled-skills/playbooks/tools/playbook-update.ts +9 -0
- package/src/config/bundled-skills/reminder/SKILL.md +20 -0
- package/src/config/bundled-skills/reminder/TOOLS.json +67 -0
- package/src/config/bundled-skills/reminder/tools/reminder-cancel.ts +9 -0
- package/src/config/bundled-skills/reminder/tools/reminder-create.ts +9 -0
- package/src/config/bundled-skills/reminder/tools/reminder-list.ts +9 -0
- package/src/config/bundled-skills/schedule/SKILL.md +74 -0
- package/src/config/bundled-skills/schedule/TOOLS.json +135 -0
- package/src/config/bundled-skills/schedule/tools/schedule-create.ts +9 -0
- package/src/config/bundled-skills/schedule/tools/schedule-delete.ts +9 -0
- package/src/config/bundled-skills/schedule/tools/schedule-list.ts +9 -0
- package/src/config/bundled-skills/schedule/tools/schedule-update.ts +9 -0
- package/src/config/bundled-skills/subagent/SKILL.md +25 -0
- package/src/config/bundled-skills/subagent/TOOLS.json +107 -0
- package/src/config/bundled-skills/subagent/tools/subagent-abort.ts +9 -0
- package/src/config/bundled-skills/subagent/tools/subagent-message.ts +9 -0
- package/src/config/bundled-skills/subagent/tools/subagent-read.ts +9 -0
- package/src/config/bundled-skills/subagent/tools/subagent-spawn.ts +9 -0
- package/src/config/bundled-skills/subagent/tools/subagent-status.ts +9 -0
- package/src/config/bundled-skills/tasks/SKILL.md +28 -0
- package/src/config/bundled-skills/tasks/TOOLS.json +256 -0
- package/src/config/bundled-skills/tasks/tools/task-delete.ts +9 -0
- package/src/config/bundled-skills/tasks/tools/task-list-add.ts +9 -0
- package/src/config/bundled-skills/tasks/tools/task-list-remove.ts +9 -0
- package/src/config/bundled-skills/tasks/tools/task-list-show.ts +9 -0
- package/src/config/bundled-skills/tasks/tools/task-list-update.ts +9 -0
- package/src/config/bundled-skills/tasks/tools/task-list.ts +9 -0
- package/src/config/bundled-skills/tasks/tools/task-run.ts +9 -0
- package/src/config/bundled-skills/tasks/tools/task-save.ts +9 -0
- package/src/config/bundled-skills/twitter/SKILL.md +134 -0
- package/src/config/bundled-skills/watcher/SKILL.md +27 -0
- package/src/config/bundled-skills/watcher/TOOLS.json +147 -0
- package/src/config/bundled-skills/watcher/tools/watcher-create.ts +9 -0
- package/src/config/bundled-skills/watcher/tools/watcher-delete.ts +9 -0
- package/src/config/bundled-skills/watcher/tools/watcher-digest.ts +9 -0
- package/src/config/bundled-skills/watcher/tools/watcher-list.ts +9 -0
- package/src/config/bundled-skills/watcher/tools/watcher-update.ts +9 -0
- package/src/config/defaults.ts +44 -0
- package/src/config/loader.ts +4 -1
- package/src/config/schema.ts +218 -1
- package/src/config/system-prompt.ts +100 -6
- package/src/config/templates/IDENTITY.md +7 -0
- package/src/config/types.ts +5 -0
- package/src/contacts/contact-store.ts +4 -4
- package/src/daemon/assistant-attachments.ts +10 -0
- package/src/daemon/classifier.ts +3 -1
- package/src/daemon/computer-use-session.ts +3 -1
- package/src/daemon/date-context.ts +136 -0
- package/src/daemon/handlers/apps.ts +16 -1
- package/src/daemon/handlers/browser.ts +54 -0
- package/src/daemon/handlers/computer-use.ts +7 -1
- package/src/daemon/handlers/config.ts +192 -4
- package/src/daemon/handlers/diagnostics.ts +5 -1
- package/src/daemon/handlers/documents.ts +18 -29
- package/src/daemon/handlers/home-base.ts +5 -1
- package/src/daemon/handlers/index.ts +40 -271
- package/src/daemon/handlers/misc.ts +9 -1
- package/src/daemon/handlers/publish.ts +6 -1
- package/src/daemon/handlers/sessions.ts +65 -12
- package/src/daemon/handlers/shared.ts +36 -1
- package/src/daemon/handlers/signing.ts +37 -0
- package/src/daemon/handlers/skills.ts +20 -6
- package/src/daemon/handlers/subagents.ts +8 -3
- package/src/daemon/handlers/twitter-auth.ts +169 -0
- package/src/daemon/handlers/work-items.ts +495 -39
- package/src/daemon/ipc-contract-inventory.json +40 -4
- package/src/daemon/ipc-contract.ts +185 -37
- package/src/daemon/ipc-protocol.ts +7 -2
- package/src/daemon/lifecycle.ts +48 -5
- package/src/daemon/main.ts +10 -4
- package/src/daemon/ride-shotgun-handler.ts +74 -10
- package/src/daemon/server.ts +144 -29
- package/src/daemon/session-agent-loop.ts +887 -0
- package/src/daemon/session-attachments.ts +28 -5
- package/src/daemon/session-error.ts +24 -3
- package/src/daemon/session-lifecycle.ts +147 -0
- package/src/daemon/session-media-retry.ts +147 -0
- package/src/daemon/session-messaging.ts +145 -0
- package/src/daemon/session-notifiers.ts +164 -0
- package/src/daemon/session-process.ts +2 -2
- package/src/daemon/session-queue-manager.ts +1 -0
- package/src/daemon/session-runtime-assembly.ts +52 -0
- package/src/daemon/session-skill-tools.ts +124 -5
- package/src/daemon/session-slash.ts +3 -0
- package/src/daemon/session-surfaces.ts +77 -2
- package/src/daemon/session-tool-setup.ts +222 -2
- package/src/daemon/session-usage.ts +0 -2
- package/src/daemon/session.ts +114 -1365
- package/src/daemon/video-thumbnail.ts +60 -0
- package/src/doordash/client.ts +121 -27
- package/src/doordash/queries.ts +1 -2
- package/src/export/formatter.ts +3 -1
- package/src/followups/followup-store.ts +4 -2
- package/src/followups/types.ts +6 -0
- package/src/hooks/templates.ts +1 -1
- package/src/index.ts +32 -1151
- package/src/media/gemini-image-service.ts +1 -1
- package/src/memory/attachments-store.ts +28 -83
- package/src/memory/channel-delivery-store.ts +7 -21
- package/src/memory/clarification-resolver.ts +6 -5
- package/src/memory/contradiction-checker.ts +3 -2
- package/src/memory/conversation-key-store.ts +10 -29
- package/src/memory/conversation-store.ts +2 -1
- package/src/memory/db.ts +362 -2
- package/src/memory/entity-extractor.ts +6 -3
- package/src/memory/items-extractor.ts +5 -4
- package/src/memory/jobs-store.ts +3 -2
- package/src/memory/llm-usage-store.ts +1 -2
- package/src/memory/runs-store.ts +1 -2
- package/src/memory/schema.ts +65 -2
- package/src/messaging/style-analyzer.ts +3 -2
- package/src/messaging/thread-summarizer.ts +8 -12
- package/src/messaging/triage-engine.ts +4 -2
- package/src/providers/openrouter/client.ts +20 -0
- package/src/providers/registry.ts +8 -0
- package/src/runtime/http-server.ts +277 -25
- package/src/runtime/http-types.ts +0 -2
- package/src/runtime/routes/attachment-routes.ts +5 -6
- package/src/runtime/routes/call-routes.ts +140 -0
- package/src/runtime/routes/channel-routes.ts +12 -19
- package/src/runtime/routes/conversation-routes.ts +5 -9
- package/src/runtime/routes/run-routes.ts +4 -8
- package/src/runtime/run-orchestrator.ts +39 -6
- package/src/schedule/recurrence-engine.ts +138 -0
- package/src/schedule/recurrence-types.ts +67 -0
- package/src/schedule/schedule-store.ts +102 -57
- package/src/schedule/scheduler.ts +9 -6
- package/src/security/oauth2.ts +29 -4
- package/src/security/secret-allowlist.ts +46 -0
- package/src/skills/clawhub.ts +1 -1
- package/src/subagent/manager.ts +40 -8
- package/src/swarm/backend-claude-code.ts +64 -9
- package/src/swarm/worker-prompts.ts +2 -1
- package/src/tasks/SPEC.md +34 -28
- package/src/tasks/ephemeral-permissions.ts +16 -7
- package/src/tasks/task-compiler.ts +5 -4
- package/src/tasks/task-runner.ts +10 -5
- package/src/tasks/task-scheduler.ts +1 -1
- package/src/tasks/tool-sanitizer.ts +36 -0
- package/src/tools/assets/search.ts +4 -4
- package/src/tools/browser/api-map.ts +220 -0
- package/src/tools/browser/auto-navigate.ts +270 -0
- package/src/tools/browser/browser-execution.ts +2 -1
- package/src/tools/browser/browser-manager.ts +2 -2
- package/src/tools/browser/network-recorder.ts +5 -4
- package/src/tools/browser/x-auto-navigate.ts +207 -0
- package/src/tools/calls/call-end.ts +67 -0
- package/src/tools/calls/call-start.ts +73 -0
- package/src/tools/calls/call-status.ts +81 -0
- package/src/tools/claude-code/claude-code.ts +77 -11
- package/src/tools/contacts/contact-merge.ts +46 -78
- package/src/tools/contacts/contact-search.ts +35 -79
- package/src/tools/contacts/contact-upsert.ts +35 -108
- package/src/tools/credentials/vault.ts +21 -5
- package/src/tools/document/document-tool.ts +71 -144
- package/src/tools/executor.ts +129 -10
- package/src/tools/followups/followup_create.ts +46 -88
- package/src/tools/followups/followup_list.ts +34 -74
- package/src/tools/followups/followup_resolve.ts +31 -66
- package/src/tools/host-terminal/cli-discover.ts +2 -1
- package/src/tools/host-terminal/host-shell.ts +10 -0
- package/src/tools/memory/handlers.ts +5 -4
- package/src/tools/network/__tests__/web-search.test.ts +427 -0
- package/src/tools/network/script-proxy/__tests__/logging.test.ts +248 -0
- package/src/tools/network/script-proxy/__tests__/policy.test.ts +234 -0
- package/src/tools/network/script-proxy/__tests__/router.test.ts +76 -0
- package/src/tools/network/web-fetch.ts +18 -6
- package/src/tools/playbooks/index.ts +4 -5
- package/src/tools/playbooks/playbook-create.ts +3 -47
- package/src/tools/playbooks/playbook-delete.ts +1 -25
- package/src/tools/playbooks/playbook-list.ts +1 -28
- package/src/tools/playbooks/playbook-update.ts +3 -51
- package/src/tools/registry.ts +2 -4
- package/src/tools/reminder/reminder.ts +5 -78
- package/src/tools/schedule/create.ts +69 -74
- package/src/tools/schedule/delete.ts +21 -47
- package/src/tools/schedule/list.ts +55 -74
- package/src/tools/schedule/update.ts +77 -84
- package/src/tools/subagent/abort.ts +29 -58
- package/src/tools/subagent/message.ts +30 -63
- package/src/tools/subagent/read.ts +53 -84
- package/src/tools/subagent/spawn.ts +43 -82
- package/src/tools/subagent/status.ts +42 -71
- package/src/tools/swarm/delegate.ts +2 -1
- package/src/tools/tasks/index.ts +8 -6
- package/src/tools/tasks/task-delete.ts +69 -56
- package/src/tools/tasks/task-list.ts +31 -52
- package/src/tools/tasks/task-run.ts +74 -102
- package/src/tools/tasks/task-save.ts +33 -65
- package/src/tools/tasks/work-item-enqueue.ts +192 -134
- package/src/tools/tasks/work-item-list.ts +33 -78
- package/src/tools/tasks/work-item-remove.ts +60 -0
- package/src/tools/tasks/work-item-update.ts +114 -0
- package/src/tools/terminal/backends/native.ts +3 -1
- package/src/tools/tool-manifest.ts +20 -74
- package/src/tools/types.ts +6 -0
- package/src/tools/ui-surface/definitions.ts +6 -1
- package/src/tools/watch/screen-watch.ts +3 -1
- package/src/tools/watcher/create.ts +52 -98
- package/src/tools/watcher/delete.ts +20 -46
- package/src/tools/watcher/digest.ts +36 -70
- package/src/tools/watcher/list.ts +49 -79
- package/src/tools/watcher/update.ts +45 -91
- package/src/twitter/client.ts +690 -0
- package/src/twitter/session.ts +91 -0
- package/src/usage/types.ts +0 -1
- package/src/util/truncate.ts +6 -0
- package/src/watcher/providers/slack.ts +2 -1
- package/src/watcher/watcher-store.ts +3 -2
- package/src/work-items/work-item-store.ts +236 -2
- package/src/workspace/commit-message-enrichment-service.ts +284 -0
- package/src/workspace/commit-message-provider.ts +95 -0
- package/src/workspace/git-service.ts +272 -52
- package/src/workspace/heartbeat-service.ts +70 -13
- package/src/workspace/provider-commit-message-generator.ts +242 -0
- package/src/workspace/turn-commit.ts +100 -51
- package/src/tools/contacts/index.ts +0 -4
- package/src/tools/document/index.ts +0 -5
- package/src/tools/followups/index.ts +0 -3
- package/src/tools/subagent/index.ts +0 -5
- /package/src/__tests__/{memory-context-benchmark.test.ts → memory-context-benchmark.benchmark.test.ts} +0 -0
|
@@ -0,0 +1,500 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tool Execution Pipeline Benchmark
|
|
3
|
+
*
|
|
4
|
+
* Measures the overhead of each phase in the permission/security pipeline:
|
|
5
|
+
* 1. classifyRisk — risk classification
|
|
6
|
+
* 2. check — trust rule matching (both no-rule fallback and matched-rule paths)
|
|
7
|
+
* 3. scanText — secret scanning on output
|
|
8
|
+
* 4. ToolExecutor.execute() — full pipeline overhead with noop/slow tools
|
|
9
|
+
*
|
|
10
|
+
* Target ranges:
|
|
11
|
+
* - p50 pipeline overhead (classifyRisk + check) < 20ms for pre-approved tools
|
|
12
|
+
* - p95 pipeline overhead < 50ms
|
|
13
|
+
* - Overhead is constant regardless of tool execution time
|
|
14
|
+
* - Secret scanning < 5ms for short outputs (< 1KB)
|
|
15
|
+
* - Secret scanning < 50ms for large outputs (100KB)
|
|
16
|
+
* - ToolExecutor overhead < 20ms regardless of tool execution time
|
|
17
|
+
*/
|
|
18
|
+
import { describe, test, expect, beforeAll, afterAll, mock } from 'bun:test';
|
|
19
|
+
import { mkdtempSync, rmSync } from 'node:fs';
|
|
20
|
+
import { tmpdir } from 'node:os';
|
|
21
|
+
import { join } from 'node:path';
|
|
22
|
+
|
|
23
|
+
// Unique scratch directory for this benchmark run; the mocked platform paths
// point into it and afterAll() removes it on a best-effort basis.
const testDir = mkdtempSync(join(tmpdir(), 'tool-pipeline-bench-'));
|
|
24
|
+
|
|
25
|
+
// Local registry for ToolExecutor tests — the mock delegates to this map
|
|
26
|
+
// so that registerTool/getTool/getAllTools work for our benchmark tools.
|
|
27
|
+
// Keyed by tool name (the mocked registerTool stores entries under tool.name).
const localRegistry = new Map<string, import('../tools/types.js').Tool>();
|
|
28
|
+
|
|
29
|
+
// Mocks must precede imports of modules under test.
|
|
30
|
+
// Platform stub: every path accessor resolves inside the scratch test
// directory, while the OS predicates still report the real host platform.
mock.module('../util/platform.js', () => {
  const underTestDir = (name: string): string => join(testDir, name);

  return {
    getDataDir: () => testDir,
    isMacOS: () => process.platform === 'darwin',
    isLinux: () => process.platform === 'linux',
    isWindows: () => process.platform === 'win32',
    getSocketPath: () => underTestDir('test.sock'),
    getPidPath: () => underTestDir('test.pid'),
    getDbPath: () => underTestDir('test.db'),
    getLogPath: () => underTestDir('test.log'),
    // Nothing to create here: the scratch directory already exists.
    ensureDataDir: () => {},
    getHooksDir: () => underTestDir('hooks'),
  };
});
|
|
42
|
+
|
|
43
|
+
// Logger stub: any property read on the returned logger yields a fresh no-op
// function, so every log call is silently discarded.
mock.module('../util/logger.js', () => {
  const createSilentLogger = () =>
    new Proxy({} as Record<string, unknown>, {
      get() {
        return () => {};
      },
    });

  return {
    getLogger: createSilentLogger,
    isDebug: () => false,
  };
});
|
|
49
|
+
|
|
50
|
+
// Allow toggling between no-rule and matched-rule paths
|
|
51
|
+
// Value handed back by the mocked findHighestPriorityRule. It stays null for
// the no-rule path; a test that needs a matched rule assigns one and resets
// it to null in a `finally` block.
let mockRuleResponse: import('../permissions/types.js').TrustRule | null = null;
|
|
52
|
+
|
|
53
|
+
// Trust-store stub: rule lookup returns whatever `mockRuleResponse` holds at
// call time; the mutating entry points do nothing.
mock.module('../permissions/trust-store.js', () => {
  return {
    addRule() {},
    findHighestPriorityRule() {
      return mockRuleResponse;
    },
    clearCache() {},
  };
});
|
|
58
|
+
|
|
59
|
+
// Config stub: getConfig() builds a fresh fixed configuration on every call —
// legacy permission mode, secret detection enabled with action 'warn',
// sandbox disabled.
mock.module('../config/loader.js', () => ({
  getConfig() {
    return {
      provider: 'mock-provider',
      timeouts: {
        permissionTimeoutSec: 5,
        toolExecutionTimeoutSec: 120,
      },
      permissions: { mode: 'legacy' },
      skills: {
        load: { extraDirs: [] },
      },
      secretDetection: {
        enabled: true,
        entropyThreshold: 4.0,
        action: 'warn',
      },
      sandbox: { enabled: false },
      contextWindow: {},
      memory: {},
    };
  },
}));
|
|
71
|
+
|
|
72
|
+
// Skills stub: no skill ever resolves and the catalog is always empty.
mock.module('../config/skills.js', () => {
  return {
    resolveSkillSelector() {
      return { skill: null };
    },
    loadSkillCatalog() {
      return [];
    },
  };
});
|
|
76
|
+
|
|
77
|
+
// Registry stub: all three entry points read from / write to `localRegistry`,
// so tools registered by the benchmark are visible to the code under test.
mock.module('../tools/registry.js', () => ({
  getTool(name: string) {
    return localRegistry.get(name);
  },
  getAllTools() {
    return [...localRegistry.values()];
  },
  registerTool(tool: import('../tools/types.js').Tool) {
    localRegistry.set(tool.name, tool);
  },
}));
|
|
82
|
+
|
|
83
|
+
// Hook-manager stub: every trigger resolves to `{ blocked: false }`.
mock.module('../hooks/manager.js', () => ({
  getHookManager() {
    return {
      async trigger() {
        return { blocked: false };
      },
    };
  },
}));
|
|
88
|
+
|
|
89
|
+
import { classifyRisk, check } from '../permissions/checker.js';
|
|
90
|
+
import { scanText, DEFAULT_ENTROPY_CONFIG } from '../security/secret-scanner.js';
|
|
91
|
+
import { RiskLevel } from '../permissions/types.js';
|
|
92
|
+
import { ToolExecutor } from '../tools/executor.js';
|
|
93
|
+
import { PermissionPrompter } from '../permissions/prompter.js';
|
|
94
|
+
import type { Tool, ToolContext, ToolExecutionResult } from '../tools/types.js';
|
|
95
|
+
|
|
96
|
+
// ---------------------------------------------------------------------------
|
|
97
|
+
// Helpers
|
|
98
|
+
// ---------------------------------------------------------------------------
|
|
99
|
+
|
|
100
|
+
/**
 * Returns the p-th percentile of `values` using the nearest-rank method.
 *
 * The input is copied before sorting, so the caller's array is not mutated.
 *
 * @param values - Samples to rank (the benchmarks pass millisecond timings).
 *   Must contain at least one element.
 * @param p - Percentile on a 0–100 scale. Values at or below 0 select the
 *   smallest sample; values above 100 select the largest.
 * @returns The sample whose 1-based rank is ceil(p / 100 * n).
 * @throws RangeError when `values` is empty, because no percentile exists.
 */
function percentile(values: number[], p: number): number {
  if (values.length === 0) {
    throw new RangeError('percentile() requires at least one value');
  }
  const sorted = [...values].sort((a, b) => a - b);
  const rank = Math.ceil((p / 100) * sorted.length) - 1;
  // Clamp both ends: p <= 0 yields rank -1, and p > 100 would index past the
  // last element and return undefined.
  const idx = Math.min(Math.max(rank, 0), sorted.length - 1);
  return sorted[idx];
}
|
|
105
|
+
|
|
106
|
+
/**
 * Times an async function over repeated, strictly sequential invocations.
 *
 * Each call is awaited before the next one starts, and its duration is
 * measured with performance.now() around the awaited call.
 *
 * @param fn - Async function to measure.
 * @param iterations - How many times to invoke `fn`.
 * @returns `timings` (one duration per call, in call order) and `results`
 *   (the resolved value of each call, in the same order).
 */
async function benchmarkAsync<T>(
  fn: () => Promise<T>,
  iterations: number,
): Promise<{ timings: number[]; results: T[] }> {
  const samples: Array<{ elapsed: number; value: T }> = [];

  for (let run = 0; run < iterations; run += 1) {
    const startedAt = performance.now();
    const value = await fn();
    const elapsed = performance.now() - startedAt;
    samples.push({ elapsed, value });
  }

  return {
    timings: samples.map((sample) => sample.elapsed),
    results: samples.map((sample) => sample.value),
  };
}
|
|
120
|
+
|
|
121
|
+
/**
 * Times a synchronous function over repeated invocations.
 *
 * Each call's duration is measured with performance.now() immediately before
 * and after the call.
 *
 * @param fn - Function to measure.
 * @param iterations - How many times to invoke `fn`.
 * @returns `timings` (one duration per call, in call order) and `results`
 *   (the return value of each call, in the same order).
 */
function benchmarkSync<T>(
  fn: () => T,
  iterations: number,
): { timings: number[]; results: T[] } {
  const samples: Array<{ elapsed: number; value: T }> = [];

  for (let run = 0; run < iterations; run += 1) {
    const startedAt = performance.now();
    const value = fn();
    const elapsed = performance.now() - startedAt;
    samples.push({ elapsed, value });
  }

  return {
    timings: samples.map((sample) => sample.elapsed),
    results: samples.map((sample) => sample.value),
  };
}
|
|
135
|
+
|
|
136
|
+
/**
 * Builds pseudo-random, code-like text of exactly `sizeBytes` characters.
 *
 * Lines of ten randomly chosen keywords are joined with newlines and the
 * result is cut to the requested size. All keywords are ASCII, so the
 * character count equals the UTF-8 byte count.
 *
 * @param sizeBytes - Desired output length; values <= 0 produce ''.
 * @returns A string of exactly `sizeBytes` characters (for positive sizes).
 */
function generateLargeOutput(sizeBytes: number): string {
  if (sizeBytes <= 0) {
    return '';
  }

  const words = [
    'function', 'const', 'let', 'return', 'import', 'export',
    'class', 'interface', 'type', 'async', 'await', 'Promise',
    'string', 'number', 'boolean', 'undefined', 'null', 'void',
  ];
  const lines: string[] = [];

  // Tracks the length of lines.join('\n'). It starts at -1 because n lines are
  // joined by only n - 1 newlines; counting one newline per line (as the
  // previous version did) could stop one character early and return
  // sizeBytes - 1 characters.
  let joinedLength = -1;
  while (joinedLength < sizeBytes) {
    const lineWords: string[] = [];
    for (let w = 0; w < 10; w++) {
      lineWords.push(words[Math.floor(Math.random() * words.length)]);
    }
    const line = lineWords.join(' ');
    lines.push(line);
    joinedLength += line.length + 1; // +1 for the separating newline
  }

  return lines.join('\n').slice(0, sizeBytes);
}
|
|
156
|
+
|
|
157
|
+
// ---------------------------------------------------------------------------
|
|
158
|
+
// Benchmark suite
|
|
159
|
+
// ---------------------------------------------------------------------------
|
|
160
|
+
|
|
161
|
+
// Number of timed samples collected per benchmark; percentiles are computed
// over this many measurements.
const ITERATIONS = 100;
|
|
162
|
+
// Number of untimed warm-up rounds executed in beforeAll() before measuring.
const WARMUP = 5;
|
|
163
|
+
|
|
164
|
+
describe('Tool execution pipeline benchmark', () => {
|
|
165
|
+
// Warm up the parser/modules
|
|
166
|
+
beforeAll(async () => {
|
|
167
|
+
for (let i = 0; i < WARMUP; i++) {
|
|
168
|
+
await classifyRisk('file_read', { path: '/tmp/test.ts' }, '/tmp');
|
|
169
|
+
await check('file_read', { path: '/tmp/test.ts' }, '/tmp');
|
|
170
|
+
scanText('no secrets here');
|
|
171
|
+
}
|
|
172
|
+
});
|
|
173
|
+
|
|
174
|
+
afterAll(() => {
|
|
175
|
+
try {
|
|
176
|
+
rmSync(testDir, { recursive: true });
|
|
177
|
+
} catch {
|
|
178
|
+
// best effort cleanup
|
|
179
|
+
}
|
|
180
|
+
});
|
|
181
|
+
|
|
182
|
+
test('classifyRisk: low-risk tool (file_read) is fast', async () => {
|
|
183
|
+
const { timings } = await benchmarkAsync(
|
|
184
|
+
() => classifyRisk('file_read', { path: '/tmp/test.ts' }, '/tmp'),
|
|
185
|
+
ITERATIONS,
|
|
186
|
+
);
|
|
187
|
+
|
|
188
|
+
const p50 = percentile(timings, 50);
|
|
189
|
+
const p95 = percentile(timings, 95);
|
|
190
|
+
|
|
191
|
+
expect(p50).toBeLessThan(5);
|
|
192
|
+
expect(p95).toBeLessThan(10);
|
|
193
|
+
});
|
|
194
|
+
|
|
195
|
+
test('classifyRisk: bash command classification', async () => {
|
|
196
|
+
const { timings, results } = await benchmarkAsync(
|
|
197
|
+
() => classifyRisk('bash', { command: 'ls -la /tmp' }, '/tmp'),
|
|
198
|
+
ITERATIONS,
|
|
199
|
+
);
|
|
200
|
+
|
|
201
|
+
const p50 = percentile(timings, 50);
|
|
202
|
+
const p95 = percentile(timings, 95);
|
|
203
|
+
|
|
204
|
+
// Bash classification involves shell parsing so it is slower
|
|
205
|
+
expect(p50).toBeLessThan(15);
|
|
206
|
+
expect(p95).toBeLessThan(40);
|
|
207
|
+
// Verify correctness: ls should be low risk
|
|
208
|
+
expect(results[0]).toBe(RiskLevel.Low);
|
|
209
|
+
});
|
|
210
|
+
|
|
211
|
+
test('classifyRisk: medium-risk tool (file_write)', async () => {
|
|
212
|
+
const { timings, results } = await benchmarkAsync(
|
|
213
|
+
() => classifyRisk('file_write', { path: '/tmp/out.txt' }, '/tmp'),
|
|
214
|
+
ITERATIONS,
|
|
215
|
+
);
|
|
216
|
+
|
|
217
|
+
const p50 = percentile(timings, 50);
|
|
218
|
+
expect(p50).toBeLessThan(5);
|
|
219
|
+
expect(results[0]).toBe(RiskLevel.Medium);
|
|
220
|
+
});
|
|
221
|
+
|
|
222
|
+
test('check: full permission check for low-risk tool', async () => {
|
|
223
|
+
const { timings, results } = await benchmarkAsync(
|
|
224
|
+
() => check('file_read', { path: '/tmp/test.ts' }, '/tmp'),
|
|
225
|
+
ITERATIONS,
|
|
226
|
+
);
|
|
227
|
+
|
|
228
|
+
const p50 = percentile(timings, 50);
|
|
229
|
+
const p95 = percentile(timings, 95);
|
|
230
|
+
|
|
231
|
+
// Full check includes classifyRisk + trust rule lookup
|
|
232
|
+
expect(p50).toBeLessThan(10);
|
|
233
|
+
expect(p95).toBeLessThan(20);
|
|
234
|
+
// Low-risk with no matching rule should auto-allow
|
|
235
|
+
expect(results[0].decision).toBe('allow');
|
|
236
|
+
});
|
|
237
|
+
|
|
238
|
+
test('check: full permission check for bash command', async () => {
|
|
239
|
+
const { timings, results } = await benchmarkAsync(
|
|
240
|
+
() => check('bash', { command: 'git status' }, '/tmp'),
|
|
241
|
+
ITERATIONS,
|
|
242
|
+
);
|
|
243
|
+
|
|
244
|
+
const p50 = percentile(timings, 50);
|
|
245
|
+
const p95 = percentile(timings, 95);
|
|
246
|
+
|
|
247
|
+
// Bash involves shell parsing + trust rule lookup
|
|
248
|
+
expect(p50).toBeLessThan(20);
|
|
249
|
+
expect(p95).toBeLessThan(50);
|
|
250
|
+
// git status is low risk, should auto-allow
|
|
251
|
+
expect(results[0].decision).toBe('allow');
|
|
252
|
+
});
|
|
253
|
+
|
|
254
|
+
test('check: matched allow-rule path for medium-risk tool', async () => {
|
|
255
|
+
// Exercise the code path where findHighestPriorityRule returns a matching
|
|
256
|
+
// allow rule, rather than always falling through to the no-rule default.
|
|
257
|
+
mockRuleResponse = {
|
|
258
|
+
id: 'bench:allow-file_write',
|
|
259
|
+
tool: 'file_write',
|
|
260
|
+
pattern: '**',
|
|
261
|
+
scope: '/tmp',
|
|
262
|
+
decision: 'allow',
|
|
263
|
+
priority: 90,
|
|
264
|
+
createdAt: Date.now(),
|
|
265
|
+
};
|
|
266
|
+
|
|
267
|
+
try {
|
|
268
|
+
const { timings, results } = await benchmarkAsync(
|
|
269
|
+
() => check('file_write', { path: '/tmp/out.txt' }, '/tmp'),
|
|
270
|
+
ITERATIONS,
|
|
271
|
+
);
|
|
272
|
+
|
|
273
|
+
const p50 = percentile(timings, 50);
|
|
274
|
+
const p95 = percentile(timings, 95);
|
|
275
|
+
|
|
276
|
+
expect(p50).toBeLessThan(10);
|
|
277
|
+
expect(p95).toBeLessThan(20);
|
|
278
|
+
// Medium-risk with a matching allow rule should auto-allow
|
|
279
|
+
expect(results[0].decision).toBe('allow');
|
|
280
|
+
expect(results[0].matchedRule?.id).toBe('bench:allow-file_write');
|
|
281
|
+
} finally {
|
|
282
|
+
mockRuleResponse = null;
|
|
283
|
+
}
|
|
284
|
+
});
|
|
285
|
+
|
|
286
|
+
test('check: permission cost is stable across different input paths', async () => {
  // Confirms the permission check costs about the same for a short and a
  // long file path. Tool-execution-time independence is exercised separately
  // by the ToolExecutor benchmarks.
  const shortSamples: number[] = [];
  const longSamples: number[] = [];

  // Times a single file_read permission check for the given path.
  const timeCheck = async (path: string): Promise<number> => {
    const startedAt = performance.now();
    await check('file_read', { path }, '/tmp');
    return performance.now() - startedAt;
  };

  // Alternate the two paths on every round so both sample sets see the same
  // runtime conditions.
  for (let round = 0; round < ITERATIONS; round += 1) {
    shortSamples.push(await timeCheck('/tmp/fast.ts'));
    longSamples.push(await timeCheck('/tmp/slow-complex-deeply-nested-file.ts'));
  }

  const shortMedian = percentile(shortSamples, 50);
  const longMedian = percentile(longSamples, 50);

  // Permission check cost should be roughly the same regardless of path length.
  const slower = Math.max(shortMedian, longMedian);
  // Floor the denominator so a near-zero median cannot inflate the ratio.
  const faster = Math.max(Math.min(shortMedian, longMedian), 0.001);
  expect(slower / faster).toBeLessThan(5);
});
|
|
309
|
+
|
|
310
|
+
test('scanText: short output (< 1KB) completes quickly', () => {
  // A two-line build summary stands in for typical short tool output.
  const shortOutput = 'Build succeeded. 42 tests passed, 0 failed.\nTime: 1.23s';
  const scanOnce = () => scanText(shortOutput, DEFAULT_ENTROPY_CONFIG);

  const bench = benchmarkSync(scanOnce, ITERATIONS);

  const median = percentile(bench.timings, 50);
  const tail = percentile(bench.timings, 95);

  expect(median).toBeLessThan(5);
  expect(tail).toBeLessThan(10);
});
|
|
324
|
+
|
|
325
|
+
test('scanText: large output (100KB) within budget', () => {
  // Generate the payload once, outside the timed closure, so only scanText
  // itself is measured.
  const payloadSize = 100 * 1024;
  const largeOutput = generateLargeOutput(payloadSize);
  const scanOnce = () => scanText(largeOutput, DEFAULT_ENTROPY_CONFIG);

  const bench = benchmarkSync(scanOnce, ITERATIONS);

  const median = percentile(bench.timings, 50);
  const tail = percentile(bench.timings, 95);

  expect(median).toBeLessThan(50);
  expect(tail).toBeLessThan(100);
});
|
|
339
|
+
|
|
340
|
+
test('scanText: output with secrets is detected without excessive overhead', () => {
  // The fake secrets are assembled from split literals on purpose: writing
  // them as a single literal would trip pre-commit secret scanners.
  const fakeGhToken = 'ghp_' + 'A1b2C3d4E5f6G7h8I9j0K1l2M3n4O5p6Q7r8';
  const fakeConnStr = 'postgres://' + 'user:s3cret@db.host.example.com:5432/mydb';

  const outputLines = [
    'Deploying to production...',
    `Using API key: ${fakeGhToken}`,
    `Connection: ${fakeConnStr}`,
    'Build complete.',
  ];
  const outputWithSecrets = outputLines.join('\n');

  const bench = benchmarkSync(
    () => scanText(outputWithSecrets, DEFAULT_ENTROPY_CONFIG),
    ITERATIONS,
  );

  const median = percentile(bench.timings, 50);
  expect(median).toBeLessThan(5);

  // Verify detection correctness: both planted secrets must be reported.
  const firstScan = bench.results[0];
  expect(firstScan.length).toBeGreaterThanOrEqual(2);
  const detectedTypes = firstScan.map((match) => match.type);
  expect(detectedTypes).toContain('GitHub Token');
  expect(detectedTypes).toContain('Database Connection String');
});
|
|
365
|
+
|
|
366
|
+
test('combined pipeline overhead (classifyRisk + check + scanText) stays under budget', async () => {
  const diffOutput = 'diff --git a/file.ts b/file.ts\n+const x = 42;\n-const x = 41;';
  // Build a fresh input object per call, as a real caller would.
  const makeInput = () => ({ command: 'git diff HEAD' });
  const samples: number[] = [];

  for (let round = 0; round < ITERATIONS; round += 1) {
    const startedAt = performance.now();

    // Phase 1: Risk classification
    await classifyRisk('bash', makeInput(), '/tmp');
    // Phase 2: Permission check
    await check('bash', makeInput(), '/tmp');
    // Phase 3: Secret scanning on output
    scanText(diffOutput, DEFAULT_ENTROPY_CONFIG);

    samples.push(performance.now() - startedAt);
  }

  const median = percentile(samples, 50);
  const tail = percentile(samples, 95);

  // Combined pipeline overhead for a pre-approved tool
  expect(median).toBeLessThan(20);
  expect(tail).toBeLessThan(50);
});
|
|
389
|
+
|
|
390
|
+
// -------------------------------------------------------------------------
|
|
391
|
+
// ToolExecutor end-to-end overhead benchmarks
|
|
392
|
+
// -------------------------------------------------------------------------
|
|
393
|
+
|
|
394
|
+
describe('ToolExecutor overhead', () => {
  // How long the artificial "slow" tool sleeps on each execution.
  const SLEEP_MS = 50;
  // Fewer iterations for slow-tool tests to avoid timeouts (50ms * 30 = 1.5s)
  const SLOW_ITERATIONS = 30;

  const toolContext: ToolContext = {
    workingDir: '/tmp',
    sessionId: 'bench-session',
    conversationId: 'bench-conv',
  };
  let executor: ToolExecutor;

  // Builds a low-risk benchmark tool that optionally sleeps for `sleepMs`
  // before returning a fixed successful result.
  function makeTool(name: string, sleepMs: number): Tool {
    const description = `Benchmark tool (${sleepMs}ms)`;
    return {
      name,
      description,
      category: 'benchmark',
      defaultRiskLevel: RiskLevel.Low,
      getDefinition: () => ({
        name,
        description,
        input_schema: { type: 'object' as const, properties: {} },
      }),
      execute: async (): Promise<ToolExecutionResult> => {
        // A zero delay skips the timer entirely.
        if (sleepMs > 0) {
          await new Promise((resolve) => setTimeout(resolve, sleepMs));
        }
        return { content: 'ok', isError: false };
      },
    };
  }

  // Executes the named tool WARMUP times before any measurement is taken.
  async function warmUp(toolName: string): Promise<void> {
    for (let i = 0; i < WARMUP; i += 1) {
      await executor.execute(toolName, {}, toolContext);
    }
  }

  // Times one execution of the named tool using performance.now().
  async function timeExecution(toolName: string): Promise<number> {
    const startedAt = performance.now();
    await executor.execute(toolName, {}, toolContext);
    return performance.now() - startedAt;
  }

  beforeAll(() => {
    // The constructor requires a prompter; it is never called for low-risk tools.
    executor = new ToolExecutor(new PermissionPrompter(() => {}));

    const benchTools = [makeTool('bench_noop', 0), makeTool('bench_slow', SLEEP_MS)];
    for (const tool of benchTools) {
      localRegistry.set(tool.name, tool);
    }
  });

  test('ToolExecutor with noop tool: pipeline overhead < 20ms', async () => {
    await warmUp('bench_noop');

    const bench = await benchmarkAsync(
      () => executor.execute('bench_noop', {}, toolContext),
      ITERATIONS,
    );

    const median = percentile(bench.timings, 50);
    const tail = percentile(bench.timings, 95);

    // Full pipeline overhead for a noop tool should be minimal
    expect(median).toBeLessThan(20);
    expect(tail).toBeLessThan(50);
  });

  test('ToolExecutor with slow tool (50ms): overhead is constant', async () => {
    await warmUp('bench_slow');

    const bench = await benchmarkAsync(
      () => executor.execute('bench_slow', {}, toolContext),
      SLOW_ITERATIONS,
    );

    const median = percentile(bench.timings, 50);

    // Total time should be ~50ms + overhead, so the median can never be
    // shorter than the sleep itself...
    expect(median).toBeGreaterThanOrEqual(SLEEP_MS);
    // ...and should not exceed sleep + a generous overhead budget.
    expect(median).toBeLessThan(SLEEP_MS + 30);
  }, 10_000);

  test('overhead subtraction: slow tool overhead matches noop overhead', async () => {
    // Interleave both tools on every round and compare pipeline overhead.
    const noopSamples: number[] = [];
    const slowSamples: number[] = [];

    for (let round = 0; round < SLOW_ITERATIONS; round += 1) {
      noopSamples.push(await timeExecution('bench_noop'));
      slowSamples.push(await timeExecution('bench_slow'));
    }

    const noopMedian = percentile(noopSamples, 50);
    const slowMedian = percentile(slowSamples, 50);

    // Overhead = slow_duration - sleep_time. Should be close to noop_duration.
    const slowOverhead = slowMedian - SLEEP_MS;

    // The overhead portion of the slow tool should be within 10ms of the noop total
    const overheadGap = Math.abs(slowOverhead - noopMedian);
    expect(overheadGap).toBeLessThan(10);
  }, 10_000);
});
|
|
500
|
+
});
|
|
@@ -993,20 +993,16 @@ describe('isSideEffectTool', () => {
|
|
|
993
993
|
expect(isSideEffectTool('account_manage')).toBe(false);
|
|
994
994
|
});
|
|
995
995
|
|
|
996
|
-
test('
|
|
997
|
-
expect(isSideEffectTool('
|
|
996
|
+
test('reminder_create is a side-effect', () => {
|
|
997
|
+
expect(isSideEffectTool('reminder_create')).toBe(true);
|
|
998
998
|
});
|
|
999
999
|
|
|
1000
|
-
test('
|
|
1001
|
-
expect(isSideEffectTool('
|
|
1000
|
+
test('reminder_cancel is a side-effect', () => {
|
|
1001
|
+
expect(isSideEffectTool('reminder_cancel')).toBe(true);
|
|
1002
1002
|
});
|
|
1003
1003
|
|
|
1004
|
-
test('
|
|
1005
|
-
expect(isSideEffectTool('
|
|
1006
|
-
});
|
|
1007
|
-
|
|
1008
|
-
test('reminder without input is NOT a side-effect', () => {
|
|
1009
|
-
expect(isSideEffectTool('reminder')).toBe(false);
|
|
1004
|
+
test('reminder_list is NOT a side-effect', () => {
|
|
1005
|
+
expect(isSideEffectTool('reminder_list')).toBe(false);
|
|
1010
1006
|
});
|
|
1011
1007
|
|
|
1012
1008
|
test('credential_store store is a side-effect', () => {
|
|
@@ -1262,7 +1258,7 @@ describe('ToolExecutor forcePromptSideEffects enforcement', () => {
|
|
|
1262
1258
|
{ name: 'document_create', input: { title: 'doc', content: 'body' } },
|
|
1263
1259
|
{ name: 'document_update', input: { id: 'doc-1', content: 'updated' } },
|
|
1264
1260
|
{ name: 'account_manage', input: { action: 'create', name: 'acct' } },
|
|
1265
|
-
{ name: '
|
|
1261
|
+
{ name: 'reminder_create', input: { fire_at: '2030-01-01T00:00:00Z', label: 'test', message: 'remind me' } },
|
|
1266
1262
|
{ name: 'credential_store', input: { action: 'store', name: 'api-key', value: 'secret' } },
|
|
1267
1263
|
];
|
|
1268
1264
|
|
|
@@ -1550,13 +1546,13 @@ describe('ToolExecutor forcePromptSideEffects enforcement', () => {
|
|
|
1550
1546
|
expect(promptCalled).toBe(false);
|
|
1551
1547
|
});
|
|
1552
1548
|
|
|
1553
|
-
test('
|
|
1549
|
+
test('reminder_create forces prompt in private thread', async () => {
|
|
1554
1550
|
checkResultOverride = { decision: 'allow', reason: 'Matched trust rule' };
|
|
1555
1551
|
|
|
1556
1552
|
const executor = new ToolExecutor(makeTrackingPrompter());
|
|
1557
1553
|
const result = await executor.execute(
|
|
1558
|
-
'
|
|
1559
|
-
{
|
|
1554
|
+
'reminder_create',
|
|
1555
|
+
{ fire_at: '2030-01-01T00:00:00Z', label: 'test', message: 'test reminder' },
|
|
1560
1556
|
makeContext({ forcePromptSideEffects: true }),
|
|
1561
1557
|
);
|
|
1562
1558
|
|
|
@@ -1564,13 +1560,13 @@ describe('ToolExecutor forcePromptSideEffects enforcement', () => {
|
|
|
1564
1560
|
expect(promptCalled).toBe(true);
|
|
1565
1561
|
});
|
|
1566
1562
|
|
|
1567
|
-
test('
|
|
1563
|
+
test('reminder_list does NOT force prompt in private thread', async () => {
|
|
1568
1564
|
checkResultOverride = { decision: 'allow', reason: 'Matched trust rule' };
|
|
1569
1565
|
|
|
1570
1566
|
const executor = new ToolExecutor(makeTrackingPrompter());
|
|
1571
1567
|
const result = await executor.execute(
|
|
1572
|
-
'
|
|
1573
|
-
{
|
|
1568
|
+
'reminder_list',
|
|
1569
|
+
{},
|
|
1574
1570
|
makeContext({ forcePromptSideEffects: true }),
|
|
1575
1571
|
);
|
|
1576
1572
|
|