@vellumai/assistant 0.8.4 → 0.8.5
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- package/ARCHITECTURE.md +2 -2
- package/docs/browser-use-architecture-phase2.md +1 -1
- package/knip.json +2 -1
- package/openapi.yaml +809 -11
- package/package.json +1 -1
- package/src/__tests__/anthropic-provider.test.ts +34 -37
- package/src/__tests__/assistant-event-hub-self-exclusion.test.ts +293 -0
- package/src/__tests__/assistant-feature-flags-integration.test.ts +3 -3
- package/src/__tests__/audit-log-rotation.test.ts +70 -16
- package/src/__tests__/background-workers-disk-pressure.test.ts +3 -3
- package/src/__tests__/btw-routes.test.ts +2 -3
- package/src/__tests__/call-controller.test.ts +0 -1
- package/src/__tests__/cancel-resolves-conversation-key.test.ts +1 -1
- package/src/__tests__/channel-guardian.test.ts +3 -3
- package/src/__tests__/checker.test.ts +6 -15
- package/src/__tests__/compaction-events.test.ts +1 -0
- package/src/__tests__/compactor-call-site-logging.test.ts +214 -0
- package/src/__tests__/computer-use-skill-manifest-regression.test.ts +5 -11
- package/src/__tests__/computer-use-tools.test.ts +2 -4
- package/src/__tests__/confirmation-request-guardian-bridge.test.ts +0 -1
- package/src/__tests__/conversation-agent-loop-disk-pressure.test.ts +1 -1
- package/src/__tests__/conversation-agent-loop-inference-profile.test.ts +1 -1
- package/src/__tests__/conversation-agent-loop-overflow.test.ts +197 -2
- package/src/__tests__/conversation-agent-loop.test.ts +163 -122
- package/src/__tests__/conversation-app-control-instantiation.test.ts +2 -5
- package/src/__tests__/conversation-clear-safety.test.ts +25 -25
- package/src/__tests__/conversation-delete-schedule-cleanup.test.ts +1 -1
- package/src/__tests__/conversation-disk-view-integration.test.ts +2 -2
- package/src/__tests__/conversation-error.test.ts +31 -0
- package/src/__tests__/conversation-fork-crud.test.ts +178 -15
- package/src/__tests__/conversation-lifecycle.test.ts +52 -11
- package/src/__tests__/{conversation-load-cleaned-at.test.ts → conversation-load-history-stripped.test.ts} +13 -13
- package/src/__tests__/conversation-provider-retry-repair.test.ts +1 -0
- package/src/__tests__/conversation-routes-disk-view.test.ts +109 -0
- package/src/__tests__/conversation-routes-slash-commands.test.ts +35 -0
- package/src/__tests__/conversation-skill-tools.test.ts +2 -5
- package/src/__tests__/conversation-store.test.ts +1 -1
- package/src/__tests__/conversation-sync-tags.test.ts +99 -32
- package/src/__tests__/conversation-workspace-cache-state.test.ts +1 -0
- package/src/__tests__/conversation-workspace-injection.test.ts +1 -1
- package/src/__tests__/conversation-workspace-tool-tracking.test.ts +1 -1
- package/src/__tests__/credential-execution-feature-gates.test.ts +9 -7
- package/src/__tests__/credential-execution-tools.test.ts +6 -6
- package/src/__tests__/credential-security-invariants.test.ts +1 -0
- package/src/__tests__/credential-vault-unit.test.ts +2 -2
- package/src/__tests__/dynamic-page-surface.test.ts +2 -2
- package/src/__tests__/email-html-renderer.test.ts +12 -0
- package/src/__tests__/gateway-flag-listener.test.ts +237 -0
- package/src/__tests__/gemini-provider.test.ts +78 -0
- package/src/__tests__/guardian-dispatch.test.ts +0 -1
- package/src/__tests__/guardian-outbound-http.test.ts +7 -5
- package/src/__tests__/handlers-user-message-approval-consumption.test.ts +1 -1
- package/src/__tests__/heartbeat-disk-pressure.test.ts +4 -0
- package/src/__tests__/heartbeat-service.test.ts +4 -0
- package/src/__tests__/host-shell-tool.test.ts +1 -1
- package/src/__tests__/init-feature-flag-overrides.test.ts +5 -6
- package/src/__tests__/list-messages-tool-merge.test.ts +70 -11
- package/src/__tests__/llm-request-log-call-site.test.ts +136 -0
- package/src/__tests__/llm-request-log-source-clickhouse.test.ts +26 -0
- package/src/__tests__/llm-resolver.test.ts +77 -9
- package/src/__tests__/llm-usage-store.test.ts +66 -0
- package/src/__tests__/logger.test.ts +89 -0
- package/src/__tests__/mcp-abort-signal.test.ts +2 -2
- package/src/__tests__/media-generate-image.test.ts +31 -0
- package/src/__tests__/memory-v2-static-injector.test.ts +7 -7
- package/src/__tests__/model-intents.test.ts +2 -4
- package/src/__tests__/notification-guardian-path.test.ts +0 -1
- package/src/__tests__/onboarding-template-contract.test.ts +1 -1
- package/src/__tests__/openai-provider.test.ts +46 -0
- package/src/__tests__/openai-responses-provider.test.ts +114 -12
- package/src/__tests__/pending-interactions-resolved-event.test.ts +0 -1
- package/src/__tests__/platform-bash-auto-approve.test.ts +2 -2
- package/src/__tests__/platform.test.ts +2 -2
- package/src/__tests__/plugin-api-tool-definition.test.ts +92 -0
- package/src/__tests__/plugin-bootstrap.test.ts +2 -2
- package/src/__tests__/plugin-tool-contribution.test.ts +13 -6
- package/src/__tests__/plugin-types.test.ts +3 -2
- package/src/__tests__/prechat-onboarding-contract.test.ts +131 -98
- package/src/__tests__/pricing.test.ts +12 -0
- package/src/__tests__/prune-jobs-changes-parser.test.ts +61 -0
- package/src/__tests__/registry.test.ts +2 -8
- package/src/__tests__/require-fresh-approval.test.ts +2 -2
- package/src/__tests__/runtime-events-sse-bilingual.test.ts +154 -0
- package/src/__tests__/shell-tool-proxy-mode.test.ts +1 -1
- package/src/__tests__/skill-feature-flags.test.ts +2 -2
- package/src/__tests__/skill-projection-feature-flag.test.ts +4 -7
- package/src/__tests__/skill-projection.benchmark.test.ts +2 -6
- package/src/__tests__/skill-tool-factory.test.ts +1 -1
- package/src/__tests__/subagent-notify-parent.test.ts +1 -1
- package/src/__tests__/suggestion-routes.test.ts +1 -0
- package/src/__tests__/sync-message-contract.test.ts +59 -0
- package/src/__tests__/system-prompt.test.ts +145 -131
- package/src/__tests__/terminal-tools.test.ts +1 -1
- package/src/__tests__/tool-approval-handler.test.ts +1 -5
- package/src/__tests__/tool-execute-pipeline.test.ts +2 -2
- package/src/__tests__/tool-execution-pipeline.benchmark.test.ts +2 -5
- package/src/__tests__/tool-executor-lifecycle-events.test.ts +15 -5
- package/src/__tests__/tool-executor.test.ts +9 -62
- package/src/__tests__/tool-grant-request-escalation.test.ts +1 -6
- package/src/__tests__/trusted-contact-approval-notifier.test.ts +0 -1
- package/src/__tests__/trusted-contact-inline-approval-integration.test.ts +1 -6
- package/src/__tests__/trusted-contact-multichannel.test.ts +0 -1
- package/src/__tests__/ui-file-upload-surface.test.ts +2 -2
- package/src/__tests__/usage-routes.test.ts +3 -0
- package/src/__tests__/verification-control-plane-policy.test.ts +2 -2
- package/src/__tests__/workspace-git-service.test.ts +6 -5
- package/src/__tests__/workspace-migration-089-move-memory-tree-out-of-v3.test.ts +86 -0
- package/src/acp/__tests__/prepare-agent-env.test.ts +146 -0
- package/src/acp/prepare-agent-env.ts +78 -0
- package/src/acp/session-manager.ts +1 -1
- package/src/agent/loop.ts +8 -0
- package/src/api/README.md +5 -0
- package/src/api/index.ts +4 -0
- package/src/api/package.json +10 -0
- package/src/background-wake/background-wake-routes.test.ts +233 -0
- package/src/background-wake/runtime-registry.ts +24 -0
- package/src/cli/commands/__tests__/browser.test.ts +23 -5
- package/src/cli/commands/__tests__/domain-register.test.ts +110 -0
- package/src/cli/commands/__tests__/domain-status.test.ts +33 -33
- package/src/cli/commands/__tests__/inference-send.test.ts +108 -5
- package/src/cli/commands/__tests__/memory-v2-compare-render.test.ts +98 -0
- package/src/cli/commands/__tests__/memory-v2.test.ts +1 -0
- package/src/cli/commands/__tests__/memory-v3-render.test.ts +340 -0
- package/src/cli/commands/browser.ts +247 -0
- package/src/cli/commands/domain.ts +91 -41
- package/src/cli/commands/inference.ts +93 -40
- package/src/cli/commands/memory-v2-compare-render.ts +115 -0
- package/src/cli/commands/memory-v2.ts +176 -1
- package/src/cli/commands/memory-v3-render.ts +344 -0
- package/src/cli/commands/memory-v3.ts +316 -0
- package/src/cli/program.ts +2 -0
- package/src/config/assistant-feature-flags.ts +21 -9
- package/src/config/bundled-skills/document-editor/SKILL.md +11 -2
- package/src/config/bundled-skills/document-editor/TOOLS.json +18 -0
- package/src/config/bundled-skills/document-editor/tools/document-open.ts +12 -0
- package/src/config/bundled-skills/image-studio/SKILL.md +4 -0
- package/src/config/bundled-skills/image-studio/tools/media-generate-image.ts +2 -2
- package/src/config/bundled-skills/media-processing/tools/ingest-media.ts +13 -8
- package/src/config/bundled-skills/messaging/tools/messaging-analyze-style.ts +10 -3
- package/src/config/bundled-skills/phone-calls/references/TRANSCRIPTS.md +16 -14
- package/src/config/bundled-skills/playbooks/tools/playbook-create.ts +7 -2
- package/src/config/bundled-skills/playbooks/tools/playbook-update.ts +7 -2
- package/src/config/bundled-tool-registry.ts +2 -0
- package/src/config/call-site-defaults.ts +7 -6
- package/src/config/feature-flag-registry.json +16 -0
- package/src/config/schemas/__tests__/memory-v2.test.ts +213 -1
- package/src/config/schemas/call-site-catalog.ts +21 -7
- package/src/config/schemas/llm.ts +12 -1
- package/src/config/schemas/memory-v2.ts +246 -0
- package/src/config/schemas/memory.ts +2 -1
- package/src/context/compactor.ts +52 -0
- package/src/conversations/__tests__/message-consolidation.test.ts +350 -0
- package/src/conversations/message-consolidation.ts +404 -0
- package/src/daemon/__tests__/conversation-tool-setup-exclude.test.ts +1 -1
- package/src/daemon/__tests__/meet-manifest-loader.test.ts +1 -1
- package/src/daemon/conversation-agent-loop-handlers.ts +2 -13
- package/src/daemon/conversation-agent-loop.ts +126 -76
- package/src/daemon/conversation-error.ts +31 -1
- package/src/daemon/conversation-lifecycle.ts +27 -22
- package/src/daemon/conversation-runtime-assembly.ts +10 -9
- package/src/daemon/conversation-tool-setup.ts +63 -3
- package/src/daemon/conversation-usage.ts +2 -0
- package/src/daemon/conversation.ts +14 -29
- package/src/daemon/disk-pressure-guard.ts +14 -2
- package/src/daemon/handlers/config-model.test.ts +1 -0
- package/src/daemon/handlers/conversations.ts +11 -3
- package/src/daemon/host-browser-proxy.ts +5 -5
- package/src/daemon/host-cu-proxy.ts +4 -4
- package/src/daemon/host-file-proxy.ts +4 -4
- package/src/daemon/host-proxy-base.ts +4 -4
- package/src/daemon/host-transfer-proxy.ts +10 -10
- package/src/daemon/lifecycle.ts +23 -20
- package/src/daemon/meet-manifest-loader.ts +1 -7
- package/src/daemon/message-types/conversations.ts +6 -9
- package/src/daemon/message-types/home.ts +1 -13
- package/src/daemon/message-types/messages.ts +6 -14
- package/src/daemon/message-types/sync.ts +14 -0
- package/src/daemon/shutdown-handlers.ts +24 -5
- package/src/daemon/switch-inference-profile-tool.ts +52 -0
- package/src/daemon/tool-setup-types.ts +13 -0
- package/src/events/relationship-state-updated.ts +25 -0
- package/src/heartbeat/__tests__/heartbeat-service.test.ts +1 -1
- package/src/home/home-greeting.ts +0 -9
- package/src/home/suggested-prompts.ts +0 -9
- package/src/ipc/gateway-flag-listener.ts +123 -0
- package/src/ipc/skill-routes/registries.ts +8 -12
- package/src/memory/__tests__/db-async-query.test.ts +165 -0
- package/src/memory/__tests__/db-maintenance.test.ts +115 -0
- package/src/memory/__tests__/jobs-store-enqueue-gate.test.ts +241 -0
- package/src/memory/__tests__/jobs-store-job-classes.test.ts +28 -1
- package/src/memory/__tests__/memory-retrospective-job.test.ts +7 -0
- package/src/memory/auto-analysis-enqueue.ts +5 -1
- package/src/memory/conversation-crud.ts +71 -70
- package/src/memory/conversation-starters-cadence.ts +3 -1
- package/src/memory/conversation-title-service.ts +19 -3
- package/src/memory/db-async-query.ts +214 -0
- package/src/memory/db-init.ts +10 -0
- package/src/memory/db-maintenance.ts +30 -21
- package/src/memory/graph/bootstrap.ts +8 -1
- package/src/memory/graph/capability-seed.ts +7 -3
- package/src/memory/graph/conversation-graph-memory.ts +100 -17
- package/src/memory/graph/extraction.ts +1 -5
- package/src/memory/graph/graph-search.ts +7 -1
- package/src/memory/indexer.ts +28 -18
- package/src/memory/job-handlers/cleanup.ts +76 -18
- package/src/memory/job-handlers/conversation-starters.ts +1 -4
- package/src/memory/jobs/embed-pkb-file.ts +6 -1
- package/src/memory/jobs-store.ts +14 -0
- package/src/memory/jobs-worker.ts +55 -22
- package/src/memory/llm-request-log-source-clickhouse.ts +42 -2
- package/src/memory/llm-request-log-source-local.ts +7 -0
- package/src/memory/llm-request-log-source.ts +9 -2
- package/src/memory/llm-request-log-store.ts +43 -1
- package/src/memory/llm-usage-store.ts +24 -0
- package/src/memory/memory-retrospective-enqueue.ts +8 -1
- package/src/memory/memory-retrospective-job.ts +5 -0
- package/src/memory/memory-v2-activation-log-store.ts +15 -6
- package/src/memory/migrations/260-rename-cleaned-at.ts +44 -0
- package/src/memory/migrations/261-llm-usage-add-raw-usage.ts +36 -0
- package/src/memory/migrations/262-memory-v3-coactivation.ts +57 -0
- package/src/memory/migrations/263-memory-v3-auto-edges.ts +50 -0
- package/src/memory/migrations/264-llm-request-log-call-site.ts +29 -0
- package/src/memory/migrations/index.ts +17 -0
- package/src/memory/migrations/registry.ts +33 -0
- package/src/memory/schema/conversations.ts +1 -1
- package/src/memory/schema/infrastructure.ts +21 -0
- package/src/memory/tool-usage-store.ts +36 -8
- package/src/memory/v2/__tests__/consolidation-job.test.ts +1 -0
- package/src/memory/v2/__tests__/harness-compare.test.ts +186 -0
- package/src/memory/v2/__tests__/harness-metrics.test.ts +74 -0
- package/src/memory/v2/__tests__/harness-oracle.test.ts +257 -0
- package/src/memory/v2/__tests__/harness-replay-input.test.ts +225 -0
- package/src/memory/v2/__tests__/harness-runner.test.ts +109 -0
- package/src/memory/v2/__tests__/injection.test.ts +127 -98
- package/src/memory/v2/__tests__/qdrant.test.ts +36 -0
- package/src/memory/v2/__tests__/router.test.ts +171 -3
- package/src/memory/v2/harness/compare.ts +57 -0
- package/src/memory/v2/harness/metrics.ts +124 -0
- package/src/memory/v2/harness/oracle.ts +145 -0
- package/src/memory/v2/harness/replay-input.ts +224 -0
- package/src/memory/v2/harness/retriever.ts +74 -0
- package/src/memory/v2/harness/router-retriever.ts +43 -0
- package/src/memory/v2/harness/runner.ts +106 -0
- package/src/memory/v2/harness/trace.ts +58 -0
- package/src/memory/v2/injection.ts +21 -15
- package/src/memory/v2/prompts/router.ts +26 -1
- package/src/memory/v2/qdrant.ts +14 -2
- package/src/memory/v2/router.ts +171 -18
- package/src/memory/v3/__tests__/coactivation-store.test.ts +422 -0
- package/src/memory/v3/__tests__/consolidation-job.test.ts +468 -0
- package/src/memory/v3/__tests__/edge-learning-job.test.ts +324 -0
- package/src/memory/v3/__tests__/edges.test.ts +563 -0
- package/src/memory/v3/__tests__/filter.test.ts +512 -0
- package/src/memory/v3/__tests__/gate.test.ts +574 -0
- package/src/memory/v3/__tests__/index-composition.test.ts +233 -0
- package/src/memory/v3/__tests__/loop.test.ts +530 -0
- package/src/memory/v3/__tests__/retriever.test.ts +226 -0
- package/src/memory/v3/__tests__/scouts.test.ts +440 -0
- package/src/memory/v3/__tests__/shadow-middleware.test.ts +312 -0
- package/src/memory/v3/__tests__/system-prompts.test.ts +154 -0
- package/src/memory/v3/__tests__/traversal.test.ts +469 -0
- package/src/memory/v3/__tests__/tree-index.test.ts +280 -0
- package/src/memory/v3/__tests__/tree-store.test.ts +529 -0
- package/src/memory/v3/__tests__/tree-walk.test.ts +707 -0
- package/src/memory/v3/__tests__/validate.test.ts +245 -0
- package/src/memory/v3/auto-edges.ts +223 -0
- package/src/memory/v3/coactivation-store.ts +124 -0
- package/src/memory/v3/consolidation-job.ts +323 -0
- package/src/memory/v3/edge-learning-job.ts +160 -0
- package/src/memory/v3/edges.ts +249 -0
- package/src/memory/v3/filter.ts +281 -0
- package/src/memory/v3/gate.ts +334 -0
- package/src/memory/v3/index-composition.ts +113 -0
- package/src/memory/v3/llm-capture.ts +46 -0
- package/src/memory/v3/loop.ts +382 -0
- package/src/memory/v3/maintenance.ts +144 -0
- package/src/memory/v3/prompt-context.ts +33 -0
- package/src/memory/v3/prompts/consolidation.ts +458 -0
- package/src/memory/v3/prompts/system-prompts.ts +196 -0
- package/src/memory/v3/retriever.ts +33 -0
- package/src/memory/v3/scouts.ts +420 -0
- package/src/memory/v3/shadow-middleware.ts +305 -0
- package/src/memory/v3/traversal.ts +206 -0
- package/src/memory/v3/tree-index.ts +237 -0
- package/src/memory/v3/tree-store.ts +394 -0
- package/src/memory/v3/tree-walk.ts +351 -0
- package/src/memory/v3/types.ts +65 -0
- package/src/memory/v3/validate.ts +300 -0
- package/src/notifications/adapters/macos.ts +18 -1
- package/src/notifications/adapters/platform.ts +1 -1
- package/src/notifications/decision-engine.ts +1 -4
- package/src/notifications/emit-signal.ts +29 -49
- package/src/permissions/prompter.ts +3 -3
- package/src/permissions/question-prompter.ts +5 -2
- package/src/permissions/secret-prompter.ts +2 -2
- package/src/plugin-api/index.ts +4 -0
- package/src/plugin-api/types.ts +7 -33
- package/src/plugins/defaults/index.ts +6 -0
- package/src/plugins/defaults/injectors.ts +18 -11
- package/src/plugins/external-plugin-loader.ts +5 -68
- package/src/plugins/types.ts +11 -16
- package/src/proactive-artifact/aux-message-injector.ts +17 -4
- package/src/prompts/__tests__/task-progress-hint-section.test.ts +3 -9
- package/src/prompts/persona-resolver.ts +36 -21
- package/src/prompts/sections.ts +39 -7
- package/src/prompts/system-prompt.ts +50 -185
- package/src/prompts/templates/BOOTSTRAP.md +2 -2
- package/src/prompts/templates/system-sections.ts +230 -8
- package/src/providers/__tests__/connection-model-compat.test.ts +234 -0
- package/src/providers/__tests__/retry-callsite.test.ts +85 -5
- package/src/providers/anthropic/client.ts +32 -66
- package/src/providers/call-site-routing.ts +14 -2
- package/src/providers/connection-model-compat.ts +38 -0
- package/src/providers/connection-resolution.ts +16 -2
- package/src/providers/gemini/client.ts +49 -6
- package/src/providers/inference/adapter-factory.ts +3 -0
- package/src/providers/minimax/client.ts +106 -0
- package/src/providers/model-catalog.ts +43 -0
- package/src/providers/model-intents.ts +1 -1
- package/src/providers/openai/chat-completions-provider.ts +6 -3
- package/src/providers/openai/codex-models.ts +18 -0
- package/src/providers/openai/responses-provider.ts +78 -21
- package/src/providers/provider-send-message.ts +7 -1
- package/src/providers/retry.ts +34 -3
- package/src/providers/thinking-config.ts +26 -1
- package/src/providers/usage-tracking.ts +2 -0
- package/src/runtime/AGENTS.md +2 -2
- package/src/runtime/agent-wake.ts +1 -0
- package/src/runtime/assistant-event-hub.ts +76 -6
- package/src/runtime/auth/route-policy.ts +36 -0
- package/src/runtime/btw-sidechain.ts +0 -6
- package/src/runtime/http-types.ts +0 -2
- package/src/runtime/migrations/vbundle-builder.ts +10 -3
- package/src/runtime/pending-interactions.ts +0 -1
- package/src/runtime/routes/__tests__/conversation-query-routes.test.ts +106 -0
- package/src/runtime/routes/__tests__/memory-v2-simulate-route.test.ts +25 -6
- package/src/runtime/routes/__tests__/plugins-routes.test.ts +512 -0
- package/src/runtime/routes/acp-routes.test.ts +255 -6
- package/src/runtime/routes/acp-routes.ts +8 -1
- package/src/runtime/routes/avatar-routes.ts +10 -10
- package/src/runtime/routes/background-wake-routes.ts +188 -0
- package/src/runtime/routes/browser-tabs-routes.ts +200 -0
- package/src/runtime/routes/btw-routes.ts +0 -6
- package/src/runtime/routes/conversation-cli-routes.ts +1 -1
- package/src/runtime/routes/conversation-list-routes.ts +12 -4
- package/src/runtime/routes/conversation-management-routes.ts +77 -20
- package/src/runtime/routes/conversation-query-routes.ts +142 -36
- package/src/runtime/routes/conversation-routes.ts +252 -410
- package/src/runtime/routes/conversation-starter-routes.ts +6 -3
- package/src/runtime/routes/disk-pressure-routes.ts +1 -1
- package/src/runtime/routes/domain-routes.ts +60 -10
- package/src/runtime/routes/email-routes.ts +5 -2
- package/src/runtime/routes/events-routes.ts +54 -10
- package/src/runtime/routes/group-routes.ts +24 -8
- package/src/runtime/routes/host-browser-routes.ts +10 -2
- package/src/runtime/routes/host-cu-routes.ts +2 -2
- package/src/runtime/routes/inbound-stages/acl-enforcement.ts +96 -3
- package/src/runtime/routes/index.ts +8 -0
- package/src/runtime/routes/inference-profile-session-handler.ts +22 -12
- package/src/runtime/routes/inference-profile-session-routes.ts +7 -1
- package/src/runtime/routes/llm-call-sites-routes.ts +32 -5
- package/src/runtime/routes/memory-item-routes.ts +8 -3
- package/src/runtime/routes/memory-v2-routes.ts +215 -5
- package/src/runtime/routes/memory-v3-routes.ts +316 -0
- package/src/runtime/routes/migration-routes.ts +21 -24
- package/src/runtime/routes/plugins-routes.ts +337 -0
- package/src/runtime/routes/rename-conversation-routes.ts +6 -2
- package/src/runtime/routes/secret-routes.ts +25 -5
- package/src/runtime/routes/settings-routes.ts +12 -11
- package/src/runtime/routes/slack-channel-routes.ts +5 -4
- package/src/runtime/routes/workspace-routes.ts +25 -10
- package/src/runtime/sync/resource-sync-events.ts +106 -38
- package/src/runtime/sync/sync-publisher.test.ts +49 -0
- package/src/runtime/sync/sync-publisher.ts +2 -1
- package/src/runtime/verification-outbound-actions.ts +73 -1
- package/src/telemetry/types.ts +12 -0
- package/src/telemetry/usage-telemetry-reporter.test.ts +48 -0
- package/src/telemetry/usage-telemetry-reporter.ts +1 -0
- package/src/tools/acp/spawn.test.ts +119 -0
- package/src/tools/acp/spawn.ts +15 -2
- package/src/tools/apps/definitions.ts +2 -8
- package/src/tools/ask-question/ask-question-tool.test.ts +3 -3
- package/src/tools/ask-question/ask-question-tool.ts +38 -45
- package/src/tools/browser/__tests__/pinned-tabs.test.ts +70 -0
- package/src/tools/browser/browser-execution.ts +16 -3
- package/src/tools/browser/cdp-client/__tests__/browser-tabs-factory.test.ts +402 -0
- package/src/tools/browser/cdp-client/__tests__/types.test.ts +3 -0
- package/src/tools/browser/cdp-client/cdp-inspect-client.ts +12 -0
- package/src/tools/browser/cdp-client/extension-cdp-client.ts +27 -1
- package/src/tools/browser/cdp-client/factory.ts +100 -17
- package/src/tools/browser/cdp-client/local-cdp-client.ts +12 -0
- package/src/tools/browser/cdp-client/types.ts +65 -0
- package/src/tools/browser/pinned-tabs.ts +96 -40
- package/src/tools/computer-use/definitions.ts +22 -78
- package/src/tools/credential-execution/make-authenticated-request.ts +3 -9
- package/src/tools/credential-execution/manage-secure-command-tool.ts +3 -9
- package/src/tools/credential-execution/run-authenticated-command.ts +3 -9
- package/src/tools/credentials/vault.ts +3 -9
- package/src/tools/document/document-tool.ts +59 -0
- package/src/tools/execution-target.ts +21 -23
- package/src/tools/executor.ts +6 -1
- package/src/tools/filesystem/edit.ts +3 -9
- package/src/tools/filesystem/list.ts +3 -9
- package/src/tools/filesystem/read.ts +3 -9
- package/src/tools/filesystem/write.ts +3 -9
- package/src/tools/host-filesystem/edit.ts +3 -9
- package/src/tools/host-filesystem/read.ts +3 -9
- package/src/tools/host-filesystem/transfer.ts +3 -9
- package/src/tools/host-filesystem/write.ts +3 -9
- package/src/tools/host-terminal/host-shell.ts +3 -9
- package/src/tools/mcp/mcp-tool-factory.ts +1 -8
- package/src/tools/memory/register.test.ts +1 -1
- package/src/tools/memory/register.ts +4 -9
- package/src/tools/network/web-fetch.ts +3 -9
- package/src/tools/network/web-search.ts +25 -32
- package/src/tools/registry.ts +7 -23
- package/src/tools/schema-transforms.ts +1 -1
- package/src/tools/skills/execute.ts +3 -9
- package/src/tools/skills/load.ts +3 -9
- package/src/tools/skills/skill-tool-factory.ts +1 -8
- package/src/tools/subagent/notify-parent.ts +3 -9
- package/src/tools/system/request-permission.ts +3 -9
- package/src/tools/terminal/shell.ts +3 -9
- package/src/tools/tool-defaults.ts +94 -0
- package/src/tools/types.ts +27 -98
- package/src/tools/ui-surface/definitions.ts +6 -22
- package/src/usage/pricing.ts +23 -0
- package/src/usage/types.ts +12 -0
- package/src/util/logger.ts +16 -7
- package/src/util/platform.ts +7 -2
- package/src/util/sqlite3-runtime.ts +65 -0
- package/src/workspace/migrations/086-revert-stale-gemini-mis-rewrites.ts +1 -0
- package/src/workspace/migrations/089-move-memory-tree-out-of-v3.ts +86 -0
- package/src/workspace/migrations/registry.ts +2 -0
- package/src/__tests__/compaction-strip-metadata-clear.test.ts +0 -206
- package/src/__tests__/message-complete-display-id.test.ts +0 -175
- package/src/daemon/query-complexity-router.ts +0 -75
- package/src/prompts/cache-boundary.ts +0 -8
|
@@ -116,7 +116,7 @@ mock.module("../../../providers/provider-send-message.js", () => ({
|
|
|
116
116
|
// them. No mock needed for `daemon/identity-helpers.js`; it tolerates a
|
|
117
117
|
// missing IDENTITY.md by returning null.
|
|
118
118
|
|
|
119
|
-
const { runRouter } = await import("../router.js");
|
|
119
|
+
const { runRouter, applyHistoricalCharBudget } = await import("../router.js");
|
|
120
120
|
const { getPageIndex, invalidatePageIndex } = await import("../page-index.js");
|
|
121
121
|
const { writePage } = await import("../page-store.js");
|
|
122
122
|
|
|
@@ -220,6 +220,7 @@ function makeConfig(overrides?: {
|
|
|
220
220
|
batchSize?: number | null;
|
|
221
221
|
tier1Size?: number | null;
|
|
222
222
|
tier2Size?: number | null;
|
|
223
|
+
historicalPairsMaxChars?: number | null;
|
|
223
224
|
}) {
|
|
224
225
|
return {
|
|
225
226
|
memory: {
|
|
@@ -231,6 +232,8 @@ function makeConfig(overrides?: {
|
|
|
231
232
|
batch_size: overrides?.batchSize ?? null,
|
|
232
233
|
tier1_size: overrides?.tier1Size ?? null,
|
|
233
234
|
tier2_size: overrides?.tier2Size ?? null,
|
|
235
|
+
historical_pairs_max_chars:
|
|
236
|
+
overrides?.historicalPairsMaxChars ?? null,
|
|
234
237
|
},
|
|
235
238
|
},
|
|
236
239
|
},
|
|
@@ -238,8 +241,12 @@ function makeConfig(overrides?: {
|
|
|
238
241
|
}
|
|
239
242
|
|
|
240
243
|
const COMMON_PARAMS = {
|
|
241
|
-
|
|
242
|
-
|
|
244
|
+
recentTurnPairs: [
|
|
245
|
+
{
|
|
246
|
+
assistantMessage: "Let me check your plan.",
|
|
247
|
+
userMessage: "What's on my plate today?",
|
|
248
|
+
},
|
|
249
|
+
],
|
|
243
250
|
nowText: "2026-05-10 14:00 PT",
|
|
244
251
|
priorEverInjected: [] as { slug: string; turn: number }[],
|
|
245
252
|
};
|
|
@@ -418,6 +425,78 @@ describe("runRouter — successful tool_use", () => {
|
|
|
418
425
|
expect(blockB.cache_control).toBeUndefined();
|
|
419
426
|
});
|
|
420
427
|
|
|
428
|
+
test("runRouterBatch front-truncates the oldest <last_turn> message when the char budget is exceeded", async () => {
|
|
429
|
+
await writePage(workspaceDir, makePage("alpha", { summary: "A" }));
|
|
430
|
+
providerStub = makeProvider(toolUseResponse([1]));
|
|
431
|
+
|
|
432
|
+
const longAssistant = "A".repeat(2_000);
|
|
433
|
+
const longUser = "B".repeat(2_000);
|
|
434
|
+
const recentAssistant = "Short prior.";
|
|
435
|
+
const justArrived = "What's relevant?";
|
|
436
|
+
|
|
437
|
+
await runRouter({
|
|
438
|
+
workspaceDir,
|
|
439
|
+
recentTurnPairs: [
|
|
440
|
+
{ assistantMessage: longAssistant, userMessage: longUser },
|
|
441
|
+
{ assistantMessage: recentAssistant, userMessage: justArrived },
|
|
442
|
+
],
|
|
443
|
+
nowText: "now",
|
|
444
|
+
priorEverInjected: [],
|
|
445
|
+
// Budget: just enough room for the most-recent pair plus the old user
|
|
446
|
+
// line in full, leaving a small slice for the very oldest assistant
|
|
447
|
+
// (which should be front-truncated with the `…` marker).
|
|
448
|
+
config: makeConfig({
|
|
449
|
+
historicalPairsMaxChars:
|
|
450
|
+
recentAssistant.length + justArrived.length + longUser.length + 50,
|
|
451
|
+
}),
|
|
452
|
+
});
|
|
453
|
+
|
|
454
|
+
const [call] = providerCalls;
|
|
455
|
+
const userMsg = call.messages[0];
|
|
456
|
+
const blockB = userMsg.content[1] as { text: string };
|
|
457
|
+
|
|
458
|
+
// The just-arrived user message and the prior assistant reply survive
|
|
459
|
+
// verbatim because they're newest in the walk.
|
|
460
|
+
expect(blockB.text).toContain(`[user]: ${justArrived}`);
|
|
461
|
+
expect(blockB.text).toContain(`[assistant]: ${recentAssistant}`);
|
|
462
|
+
|
|
463
|
+
// The older user message survives verbatim (next newest after the
|
|
464
|
+
// most-recent pair).
|
|
465
|
+
expect(blockB.text).toContain(`[user]: ${longUser}`);
|
|
466
|
+
|
|
467
|
+
// The oldest message in the walk (the older assistant) is
|
|
468
|
+
// front-truncated, so its rendered line starts with the `…` marker
|
|
469
|
+
// and ends with the suffix of the original text.
|
|
470
|
+
expect(blockB.text).toContain("[assistant]: …");
|
|
471
|
+
expect(blockB.text.endsWith(`A\n</last_turn>`)).toBe(false); // sanity
|
|
472
|
+
// The full untruncated long-assistant string must NOT appear.
|
|
473
|
+
expect(blockB.text.includes(longAssistant)).toBe(false);
|
|
474
|
+
// The TAIL of the long-assistant string SHOULD appear (kept from front-truncation).
|
|
475
|
+
expect(blockB.text).toContain(longAssistant.slice(-10));
|
|
476
|
+
});
|
|
477
|
+
|
|
478
|
+
test("null historical_pairs_max_chars renders pairs verbatim regardless of size", async () => {
|
|
479
|
+
await writePage(workspaceDir, makePage("alpha", { summary: "A" }));
|
|
480
|
+
providerStub = makeProvider(toolUseResponse([1]));
|
|
481
|
+
|
|
482
|
+
const huge = "X".repeat(5_000);
|
|
483
|
+
await runRouter({
|
|
484
|
+
workspaceDir,
|
|
485
|
+
recentTurnPairs: [
|
|
486
|
+
{ assistantMessage: huge, userMessage: "just arrived" },
|
|
487
|
+
],
|
|
488
|
+
nowText: "now",
|
|
489
|
+
priorEverInjected: [],
|
|
490
|
+
config: makeConfig(), // historical_pairs_max_chars: null
|
|
491
|
+
});
|
|
492
|
+
|
|
493
|
+
const [call] = providerCalls;
|
|
494
|
+
const blockB = call.messages[0].content[1] as { text: string };
|
|
495
|
+
expect(blockB.text).toContain(`[assistant]: ${huge}`);
|
|
496
|
+
expect(blockB.text).toContain("[user]: just arrived");
|
|
497
|
+
expect(blockB.text).not.toContain("…");
|
|
498
|
+
});
|
|
499
|
+
|
|
421
500
|
test("de-duplicates repeated IDs from the model while preserving order", async () => {
|
|
422
501
|
providerStub = makeProvider(toolUseResponse([2, 1, 2]));
|
|
423
502
|
|
|
@@ -1017,3 +1096,92 @@ describe("runRouter — tier 2 (highest EMA)", () => {
|
|
|
1017
1096
|
expect(warned).toBe(true);
|
|
1018
1097
|
});
|
|
1019
1098
|
});
|
|
1099
|
+
|
|
1100
|
+
// ---------------------------------------------------------------------------
|
|
1101
|
+
// applyHistoricalCharBudget — pure helper covering the cap semantics.
|
|
1102
|
+
// ---------------------------------------------------------------------------
|
|
1103
|
+
|
|
1104
|
+
describe("applyHistoricalCharBudget", () => {
  test("null budget is a no-op (returns a shallow copy)", () => {
    const pairs = [
      { assistantMessage: "older asst", userMessage: "older user" },
      { assistantMessage: "newer asst", userMessage: "newer user" },
    ];
    const out = applyHistoricalCharBudget(pairs, null);
    expect(out).toEqual(pairs);
    // shallow copy — not the same array reference, so callers can mutate freely
    expect(out).not.toBe(pairs);
  });

  test("budget that fits every message returns content unchanged", () => {
    const pairs = [
      { assistantMessage: "AA", userMessage: "UU" },
      { assistantMessage: "BB", userMessage: "VV" },
    ];
    // Budget equal to the exact combined length: nothing needs trimming.
    const total = "AA".length + "UU".length + "BB".length + "VV".length; // 8
    const out = applyHistoricalCharBudget(pairs, total);
    expect(out).toEqual(pairs);
  });

  test("front-truncates the oldest still-includable message when the cap is exceeded", () => {
    // Newest user is 10 chars, newest assistant is 10, older user is 10,
    // older assistant is 20. Budget 35 leaves remaining = 35 - 10 - 10 - 10 = 5
    // for the older assistant; 5 - 1 marker char = 4 kept chars from the END.
    const pairs = [
      { assistantMessage: "ABCDEFGHIJKLMNOPQRST", userMessage: "old-user--" },
      { assistantMessage: "abcdefghij", userMessage: "uvwxyzUVWX" },
    ];
    const out = applyHistoricalCharBudget(pairs, 35);
    expect(out).toEqual([
      { assistantMessage: "…QRST", userMessage: "old-user--" },
      { assistantMessage: "abcdefghij", userMessage: "uvwxyzUVWX" },
    ]);
    // Sanity: total content chars equals the budget.
    const totalChars = out.reduce(
      (acc, p) => acc + p.assistantMessage.length + p.userMessage.length,
      0,
    );
    expect(totalChars).toBe(35);
  });

  test("drops older pairs entirely when even their first message has no room", () => {
    // Budget 20 fits the most-recent pair exactly (10 + 10 = 20) and leaves
    // zero room for the older pair, which is dropped entirely.
    const pairs = [
      { assistantMessage: "OLD-ASST00", userMessage: "OLD-USER00" },
      { assistantMessage: "NEW-ASST00", userMessage: "NEW-USER00" },
    ];
    const out = applyHistoricalCharBudget(pairs, 20);
    expect(out).toEqual([
      { assistantMessage: "NEW-ASST00", userMessage: "NEW-USER00" },
    ]);
  });

  test("drops the older message of the current pair when the user line consumes the whole budget", () => {
    // Budget 10 just barely covers the newest user (10 chars). The pair's
    // own assistant message has no room and is dropped (left empty).
    const pairs = [
      { assistantMessage: "ASSISTANTX", userMessage: "USER-NEW10" },
    ];
    const out = applyHistoricalCharBudget(pairs, 10);
    expect(out).toEqual([{ assistantMessage: "", userMessage: "USER-NEW10" }]);
  });

  // The title states what the assertions check: for budgets <= 0 the output
  // equals the input.
  // NOTE(review): a literal zero budget could arguably mean "keep nothing";
  // confirm pass-through is the intended contract of applyHistoricalCharBudget.
  test("non-positive budgets leave the pairs unchanged (no truncation applied)", () => {
    const pairs = [{ assistantMessage: "x", userMessage: "y" }];
    expect(applyHistoricalCharBudget(pairs, 0)).toEqual(pairs);
    // Negative budgets are degenerate but should not throw.
    expect(applyHistoricalCharBudget(pairs, -5)).toEqual(pairs);
  });

  test("budget smaller than the truncation marker drops the would-truncate message", () => {
    // Budget 11: covers the full newest user (10 chars). The remaining 1 char
    // could hold only the 1-char marker with no content behind it, so the
    // next message (newest assistant) is dropped entirely rather than
    // emitting a marker-only message.
    const pairs = [
      { assistantMessage: "ASSISTANTX", userMessage: "USER-NEW10" },
    ];
    const out = applyHistoricalCharBudget(pairs, 11);
    expect(out).toEqual([{ assistantMessage: "", userMessage: "USER-NEW10" }]);
  });
});
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Run the comparison harness over a sample of historical turns.
|
|
3
|
+
*
|
|
4
|
+
* Ties the harness pieces together: pull oracle turns from telemetry, run each
|
|
5
|
+
* retriever over each turn's reconstructed inputs, score against the logged
|
|
6
|
+
* ground truth. Kept separate from the route handler so it can be unit-tested
|
|
7
|
+
* with a stub retriever and a fixture DB — no live router / LLM.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import type { AssistantConfig } from "../../../config/types.js";
|
|
11
|
+
import type { DrizzleDb } from "../../db-connection.js";
|
|
12
|
+
import { extractOracleTurns } from "./oracle.js";
|
|
13
|
+
import { reconstructInput } from "./replay-input.js";
|
|
14
|
+
import type { Retriever } from "./retriever.js";
|
|
15
|
+
import { type ComparisonReport, runComparison } from "./runner.js";
|
|
16
|
+
|
|
17
|
+
/** Inputs accepted by `runComparisonOverHistory`. */
export interface RunComparisonOverHistoryParams {
  // --- Shared by oracle extraction and input reconstruction ---

  /** Database handle; passed to `extractOracleTurns` and to `reconstructInput`. */
  db: DrizzleDb;

  // --- Forwarded to `reconstructInput` for every sampled turn ---

  workspaceDir: string;
  config: AssistantConfig;

  // --- Forwarded to `runComparison` ---

  /** Retrievers to run over each sampled turn. */
  retrievers: readonly Retriever[];
  /** Passed through as `ks` — presumably the recall@k cutoffs; confirm in `runner.ts`. */
  ks: number[];
  /** Forwarded only when set. */
  signal?: AbortSignal;

  // --- Oracle sampling options; each is forwarded to `extractOracleTurns` only when set ---

  limit?: number;
  strategy?: "recent" | "random";
  conversationIds?: string[];
  includeNotInjected?: boolean;
  pageExists?: (slug: string) => boolean;
}
|
|
30
|
+
|
|
31
|
+
/**
 * Sample oracle turns from `params.db` and run the comparison over them.
 *
 * Oracle turns come from `extractOracleTurns`; each turn's retriever input is
 * rebuilt on demand via `reconstructInput`, and the whole set is handed to
 * `runComparison`, whose result is returned.
 *
 * @param params - Database, workspace/config for reconstruction, retrievers,
 *   `ks`, and optional sampling options / abort signal.
 * @returns Whatever report `runComparison` resolves to.
 */
export async function runComparisonOverHistory(
  params: RunComparisonOverHistoryParams,
): Promise<ComparisonReport> {
  const {
    db,
    workspaceDir,
    config,
    retrievers,
    ks,
    limit,
    strategy,
    conversationIds,
    includeNotInjected,
    pageExists,
    signal,
  } = params;

  // Copy across only the sampling options the caller actually set. Unset
  // options are left off the object entirely (never present as `undefined`),
  // so the callee's own defaults apply.
  const sampling: NonNullable<Parameters<typeof extractOracleTurns>[1]> = {};
  if (limit !== undefined) sampling.limit = limit;
  if (strategy !== undefined) sampling.strategy = strategy;
  if (conversationIds !== undefined) sampling.conversationIds = conversationIds;
  if (includeNotInjected !== undefined) {
    sampling.includeNotInjected = includeNotInjected;
  }
  if (pageExists !== undefined) sampling.pageExists = pageExists;

  const oracleTurns = extractOracleTurns(db, sampling);

  return runComparison({
    retrievers,
    oracleTurns,
    reconstruct: (turn) => reconstructInput(db, turn, config, workspaceDir),
    ks,
    // Same rule as above: omit the key when no signal was supplied.
    ...(signal === undefined ? {} : { signal }),
  });
}
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Recall@k and per-lane diff for the comparison harness.
|
|
3
|
+
*
|
|
4
|
+
* Ground truth is the current router's logged selections (see `oracle.ts`). A
|
|
5
|
+
* retriever's "extras" (selected, not in ground truth) are reported as a
|
|
6
|
+
* *diff*, not an error — a better retriever may legitimately surface pages the
|
|
7
|
+
* router missed. recall@k is the primary signal.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import type { RetrievalOutput } from "./retriever.js";
|
|
11
|
+
|
|
12
|
+
/** Per-turn scoring of one retriever's output against the ground truth. */
export interface TurnEval {
  /** Ground-truth slugs for the turn, de-duplicated by `evalTurn`. */
  groundTruth: string[];
  /** The retriever's selected slugs, as reported in its output. */
  selected: string[];
  /** Ground-truth slugs found anywhere in the retriever's selection. */
  hits: string[];
  /** Ground-truth slugs absent from the retriever's selection. */
  misses: string[];
  /** Selected slugs outside the ground truth; reported as a diff, not an error. */
  extras: string[];
  /** recall@k, keyed by each requested k. */
  recallAtK: Record<number, number>;
  /** Number of hits per source/lane label reported by the retriever. */
  hitsByLane: Record<string, number>;
  /** Cost in USD; present only when the retriever's output reported one. */
  costUsd?: number;
  /** The retriever's failure reason, or `null` when it reported none. */
  failureReason: string | null;
}
|
|
28
|
+
|
|
29
|
+
/** Summary statistics over a list of `TurnEval`s, as produced by `aggregate`. */
export interface AggregateEval {
  /** Number of turns that were aggregated. */
  turns: number;
  /** Mean recall@k across turns, keyed by k (0 for every k when there are no turns). */
  meanRecallAtK: Record<number, number>;
  /** Fraction of turns with a non-null failure reason (0 when there are no turns). */
  failureRate: number;
  /** Mean cost over the turns that reported one; omitted when none did. */
  meanCostUsd?: number;
}
|
|
35
|
+
|
|
36
|
+
/**
 * recall@k = |topK(selected) ∩ G| / |G|. An empty ground-truth set is defined
 * as recall 1 (nothing to recall — vacuously complete).
 *
 * The intersection is a set: a ground-truth slug that appears more than once
 * within the first `k` selections is counted once, so the result always lies
 * in [0, 1].
 *
 * @param selected - Retriever output in rank order; may contain duplicates.
 * @param groundTruth - Slugs that should have been retrieved.
 * @param k - Size of the top-k window. Values <= 0 (and NaN) yield an empty
 *   window, i.e. recall 0 for a non-empty ground truth.
 * @returns Fraction of ground-truth slugs present in the first `k` selections.
 */
export function recallAtK(
  selected: readonly string[],
  groundTruth: ReadonlySet<string>,
  k: number,
): number {
  if (groundTruth.size === 0) return 1;

  // Clamp at 0: a negative `end` passed to slice() would count back from the
  // tail of the array rather than producing an empty window.
  const cutoff = Math.max(0, k);

  // Collect distinct hits so repeated slugs cannot inflate the numerator.
  const matched = new Set<string>();
  for (const slug of selected.slice(0, cutoff)) {
    if (groundTruth.has(slug)) matched.add(slug);
  }
  return matched.size / groundTruth.size;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Score one retriever output against a turn's ground truth.
 *
 * @param output - The retriever's result (selected slugs, per-slug source
 *   labels, optional cost and failure reason).
 * @param groundTruth - Expected slugs; duplicates are collapsed, first
 *   occurrence order is kept.
 * @param ks - The k values for which recall@k is computed.
 * @returns Hits/misses/extras, recall per k, hits per lane, plus the cost
 *   (only when reported) and failure reason (`null` when absent).
 */
export function evalTurn(
  output: RetrievalOutput,
  groundTruth: readonly string[],
  ks: readonly number[],
): TurnEval {
  const { selectedSlugs } = output;
  const truth = new Set(groundTruth);
  const uniqueTruth = Array.from(truth);
  const picked = new Set(selectedSlugs);

  // Partition the ground truth by whether the retriever selected each slug.
  const hits = uniqueTruth.filter((slug) => picked.has(slug));
  const misses = uniqueTruth.filter((slug) => !picked.has(slug));
  const extras = selectedSlugs.filter((slug) => !truth.has(slug));

  const recall = ks.reduce<Record<number, number>>((byK, k) => {
    byK[k] = recallAtK(selectedSlugs, truth, k);
    return byK;
  }, {});

  // Tally hits under the lane the retriever attributed them to; slugs with
  // no recorded source fall under "unknown".
  const hitsByLane: Record<string, number> = {};
  hits.forEach((slug) => {
    const lane = output.sourceBySlug.get(slug) ?? "unknown";
    const soFar = hitsByLane[lane] ?? 0;
    hitsByLane[lane] = soFar + 1;
  });

  const usd = output.cost?.usd;

  return {
    groundTruth: uniqueTruth,
    selected: selectedSlugs,
    hits,
    misses,
    extras,
    recallAtK: recall,
    hitsByLane,
    // Leave `costUsd` off entirely when the retriever reported no cost.
    ...(usd === undefined ? {} : { costUsd: usd }),
    failureReason: output.failureReason ?? null,
  };
}
|
|
92
|
+
|
|
93
|
+
/**
 * Roll per-turn evaluations up into summary statistics.
 *
 * @param perTurn - One `TurnEval` per evaluated turn.
 * @param ks - The k values to average recall over; a turn lacking an entry
 *   for some k contributes 0 for that k.
 * @returns Turn count, mean recall per k, failure rate, and — only when at
 *   least one turn reported a cost — the mean cost over those turns.
 */
export function aggregate(
  perTurn: readonly TurnEval[],
  ks: readonly number[],
): AggregateEval {
  const turnCount = perTurn.length;
  const noTurns = turnCount === 0;

  const meanRecallAtK: Record<number, number> = {};
  for (const k of ks) {
    const total = perTurn.reduce(
      (running, turn) => running + (turn.recallAtK[k] ?? 0),
      0,
    );
    // With no turns there is nothing to average; report 0 rather than NaN.
    meanRecallAtK[k] = noTurns ? 0 : total / turnCount;
  }

  // Single pass for the failure count and the cost tally.
  let failed = 0;
  let costTotal = 0;
  let costedTurns = 0;
  for (const turn of perTurn) {
    if (turn.failureReason != null) failed += 1;
    if (turn.costUsd !== undefined) {
      costTotal += turn.costUsd;
      costedTurns += 1;
    }
  }

  const summary: AggregateEval = {
    turns: turnCount,
    meanRecallAtK,
    failureRate: noTurns ? 0 : failed / turnCount,
  };
  // The mean cost is taken over turns that reported a cost, not over all turns.
  if (costedTurns > 0) summary.meanCostUsd = costTotal / costedTurns;
  return summary;
}
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Oracle extraction — the current router's logged selections as silver-standard
|
|
3
|
+
* ground truth.
|
|
4
|
+
*
|
|
5
|
+
* Source: `memory_v2_activation_logs` rows with `mode = 'router'`. Each row's
|
|
6
|
+
* `messageId` is backfilled to the turn's assistant message (see
|
|
7
|
+
* `backfillMemoryV2ActivationMessageId`), so we join `messageId → messages.id`
|
|
8
|
+
* to anchor the turn — robust, no fragile turn-counting. Rows whose messageId
|
|
9
|
+
* is null (the in-flight turn) or no longer resolves are skipped.
|
|
10
|
+
*
|
|
11
|
+
* Ground truth G(turn) = selected slugs with status ∈ {injected, in_context}
|
|
12
|
+
* (what actually reached the model), optionally + not_injected, and — when a
|
|
13
|
+
* `pageExists` predicate is supplied — only slugs whose page still exists
|
|
14
|
+
* (neither retriever can find a nonexistent page). page_missing / corrupt are
|
|
15
|
+
* always excluded.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import { and, desc, eq, inArray, isNotNull, sql } from "drizzle-orm";
|
|
19
|
+
|
|
20
|
+
import type { DrizzleDb } from "../../db-connection.js";
|
|
21
|
+
import type {
|
|
22
|
+
MemoryV2ConceptRowRecord,
|
|
23
|
+
MemoryV2ConfigSnapshot,
|
|
24
|
+
} from "../../memory-v2-activation-log-store.js";
|
|
25
|
+
import { memoryV2ActivationLogs, messages } from "../../schema.js";
|
|
26
|
+
|
|
27
|
+
/** One logged router turn, reduced to what the comparison harness needs. */
export interface OracleTurn {
  /** Conversation the activation-log row belongs to. */
  conversationId: string;
  /** Turn number recorded on the activation-log row. */
  turn: number;
  /** Id of the assistant message the log row was backfilled to; anchors reconstruction. */
  anchorMessageId: string;
  /** The anchor message's `created_at`; reconstruction cuts strictly before it. */
  anchorCreatedAt: number;
  /** De-duplicated slugs the router put in front of the model — the recall target. */
  groundTruthSlugs: string[];
  /** Config snapshot parsed from the log row. */
  loggedConfig: MemoryV2ConfigSnapshot;
  /** `createdAt` of the activation-log row itself. */
  createdAt: number;
}
|
|
39
|
+
|
|
40
|
+
/** Sampling and filtering knobs for `extractOracleTurns`. */
export interface ExtractOracleOptions {
  /**
   * Upper bound on log rows scanned (default 50). Rows can be skipped after
   * the scan, so the number of turns returned may be lower.
   */
  limit?: number;
  /**
   * Row ordering before the limit is applied: "recent" sorts by `createdAt`
   * descending, "random" uses SQL `RANDOM()`. Defaults to "recent".
   */
  strategy?: "recent" | "random";
  /** Restrict to these conversations; omitted or empty means no restriction. */
  conversationIds?: string[];
  /**
   * Also count status "not_injected" (selected but cut by the cap) as ground
   * truth. Default false.
   */
  includeNotInjected?: boolean;
  /**
   * Predicate answering whether a page still exists, typically backed by
   * `getPageIndex().bySlug`. When supplied, ground-truth slugs it rejects are
   * dropped. Omit in unit tests.
   */
  pageExists?: (slug: string) => boolean;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Narrow an untrusted parsed-JSON value to the two concept fields this module
 * reads (`slug` and `status`, both strings).
 */
function isOracleConceptRow(
  value: unknown,
): value is Pick<MemoryV2ConceptRowRecord, "slug" | "status"> {
  if (typeof value !== "object" || value === null) return false;
  const candidate = value as { slug?: unknown; status?: unknown };
  return (
    typeof candidate.slug === "string" && typeof candidate.status === "string"
  );
}

/**
 * Build oracle turns from router-mode activation-log rows.
 *
 * Scans up to `limit` rows with `mode = 'router'` and a non-null `messageId`
 * (optionally restricted to `conversationIds`), ordered per `strategy`. A row
 * is skipped — never fatal — when:
 *   - its anchor message no longer exists,
 *   - its `conceptsJson` / `configJson` fails to parse, or parses to the
 *     wrong shape (concepts not an array, config not an object),
 *   - no ground-truth slug survives the status / `pageExists` filters.
 * Individual concept entries lacking a string `slug` or `status` are ignored.
 *
 * @param db - Database handle used for both the log scan and anchor lookups.
 * @param options - Sampling and filtering options; see `ExtractOracleOptions`.
 * @returns One `OracleTurn` per surviving row, in scan order.
 */
export function extractOracleTurns(
  db: DrizzleDb,
  options: ExtractOracleOptions = {},
): OracleTurn[] {
  const {
    limit = 50,
    strategy = "recent",
    conversationIds,
    includeNotInjected = false,
    pageExists,
  } = options;

  // Statuses that count as ground truth.
  const allowedStatuses = new Set<string>(["injected", "in_context"]);
  if (includeNotInjected) allowedStatuses.add("not_injected");

  const filters = [
    eq(memoryV2ActivationLogs.mode, "router"),
    isNotNull(memoryV2ActivationLogs.messageId),
  ];
  if (conversationIds && conversationIds.length > 0) {
    filters.push(
      inArray(memoryV2ActivationLogs.conversationId, conversationIds),
    );
  }

  const rows = db
    .select({
      conversationId: memoryV2ActivationLogs.conversationId,
      messageId: memoryV2ActivationLogs.messageId,
      turn: memoryV2ActivationLogs.turn,
      conceptsJson: memoryV2ActivationLogs.conceptsJson,
      configJson: memoryV2ActivationLogs.configJson,
      createdAt: memoryV2ActivationLogs.createdAt,
    })
    .from(memoryV2ActivationLogs)
    .where(and(...filters))
    .orderBy(
      strategy === "random"
        ? sql`RANDOM()`
        : desc(memoryV2ActivationLogs.createdAt),
    )
    .limit(limit)
    .all();

  const turns: OracleTurn[] = [];
  for (const row of rows) {
    // The query already filters null ids; this re-check narrows the type.
    const messageId = row.messageId;
    if (messageId == null) continue;

    // Resolve the anchor message; rows whose message is gone are skipped.
    const anchor = db
      .select({ createdAt: messages.createdAt })
      .from(messages)
      .where(eq(messages.id, messageId))
      .limit(1)
      .all();
    const anchorRow = anchor[0];
    if (!anchorRow) continue;

    let parsedConcepts: unknown;
    let parsedConfig: unknown;
    try {
      parsedConcepts = JSON.parse(row.conceptsJson);
      parsedConfig = JSON.parse(row.configJson);
    } catch {
      continue;
    }
    // JSON.parse succeeds on payloads such as `null` or `{}`; treat a wrong
    // top-level shape like any other corrupt row instead of letting the loop
    // below throw and abort the whole extraction.
    if (!Array.isArray(parsedConcepts)) continue;
    if (typeof parsedConfig !== "object" || parsedConfig === null) continue;
    const loggedConfig = parsedConfig as MemoryV2ConfigSnapshot;

    const seen = new Set<string>();
    const groundTruthSlugs: string[] = [];
    for (const concept of parsedConcepts as unknown[]) {
      if (!isOracleConceptRow(concept)) continue;
      if (!allowedStatuses.has(concept.status)) continue;
      if (pageExists && !pageExists(concept.slug)) continue;
      if (seen.has(concept.slug)) continue;
      seen.add(concept.slug);
      groundTruthSlugs.push(concept.slug);
    }
    // A turn with nothing to recall carries no signal for the harness.
    if (groundTruthSlugs.length === 0) continue;

    turns.push({
      conversationId: row.conversationId,
      turn: row.turn,
      anchorMessageId: messageId,
      anchorCreatedAt: anchorRow.createdAt,
      groundTruthSlugs,
      loggedConfig,
      createdAt: row.createdAt,
    });
  }

  return turns;
}
|