create-walle 0.9.21 → 0.9.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -5
- package/package.json +2 -2
- package/template/CLAUDE.md +2 -2
- package/template/LICENSE +1 -1
- package/template/bin/ctm-dev-cleanup.js +24 -3
- package/template/bin/ctm-launch.sh +13 -0
- package/template/bin/dev.sh +156 -18
- package/template/bin/node-bin.sh +84 -0
- package/template/bin/pin-node.sh +51 -0
- package/template/claude-task-manager/api-prompts.js +1203 -182
- package/template/claude-task-manager/api-reviews.js +109 -15
- package/template/claude-task-manager/approval-agent.js +1360 -280
- package/template/claude-task-manager/bin/restart-ctm.sh +64 -23
- package/template/claude-task-manager/bin/storage-migration-supervisor.js +338 -0
- package/template/claude-task-manager/db.js +4417 -295
- package/template/claude-task-manager/docs/app-update-refresh-protocol.md +69 -0
- package/template/claude-task-manager/docs/approval-ai-refinement.md +138 -0
- package/template/claude-task-manager/docs/approval-rescue-loop.md +74 -0
- package/template/claude-task-manager/docs/codex-operational-warning-health.md +107 -0
- package/template/claude-task-manager/docs/codex-resume-state-guard-design.md +17 -12
- package/template/claude-task-manager/docs/codex-terminal-render-controller-handoff.md +311 -0
- package/template/claude-task-manager/docs/coding-agent-hooks-architecture.md +418 -0
- package/template/claude-task-manager/docs/conversation-import-freshness.md +20 -0
- package/template/claude-task-manager/docs/google-workspace-auth-health.md +77 -0
- package/template/claude-task-manager/docs/image-paste-ux.md +13 -0
- package/template/claude-task-manager/docs/ipad-web-preview.md +88 -0
- package/template/claude-task-manager/docs/main-loop-offload-architecture.md +66 -0
- package/template/claude-task-manager/docs/microsoft-dev-tunnel-phone-access-design.md +274 -519
- package/template/claude-task-manager/docs/mobile-live-streaming.md +27 -5
- package/template/claude-task-manager/docs/mobile-remote-submission-lifecycle.md +69 -0
- package/template/claude-task-manager/docs/phone-access-design.md +53 -15
- package/template/claude-task-manager/docs/phone-passkey-identity.md +122 -0
- package/template/claude-task-manager/docs/phone-setup.md +3 -0
- package/template/claude-task-manager/docs/prompt-editing-tree-design.md +25 -1
- package/template/claude-task-manager/docs/remote-desktop-access-design.md +268 -0
- package/template/claude-task-manager/docs/restart-lifecycle-architecture.md +95 -0
- package/template/claude-task-manager/docs/runtime-work-control-plane.md +53 -0
- package/template/claude-task-manager/docs/session-interactive-wait-surfaces.md +38 -0
- package/template/claude-task-manager/docs/session-needs-you-dismissal.md +84 -0
- package/template/claude-task-manager/docs/session-render-state-management-design.md +91 -3
- package/template/claude-task-manager/docs/session-standup-command-center-design.md +25 -1
- package/template/claude-task-manager/docs/session-title-authority.md +32 -0
- package/template/claude-task-manager/docs/session-workspace-binding.md +33 -0
- package/template/claude-task-manager/docs/skill-intent-resolution-design.md +72 -0
- package/template/claude-task-manager/docs/walle-mcp-supervisor-health.md +86 -0
- package/template/claude-task-manager/docs/walle-relay-phone-access-design.md +24 -15
- package/template/claude-task-manager/docs/walle-session-history-hydration.md +114 -0
- package/template/claude-task-manager/docs/walle-session-input-queue.md +104 -0
- package/template/claude-task-manager/docs/walle-session-model-catalog.md +90 -0
- package/template/claude-task-manager/docs/walle-session-model-preferences.md +15 -6
- package/template/claude-task-manager/git-utils.js +897 -27
- package/template/claude-task-manager/lib/agent-capabilities.js +33 -0
- package/template/claude-task-manager/lib/agent-cli-cache.js +37 -7
- package/template/claude-task-manager/lib/agent-hooks-installer.js +26 -2
- package/template/claude-task-manager/lib/agent-presets.js +17 -1
- package/template/claude-task-manager/lib/all-sessions-query.js +108 -0
- package/template/claude-task-manager/lib/approval-ai-refinement.js +488 -0
- package/template/claude-task-manager/lib/approval-self-adapt.js +168 -0
- package/template/claude-task-manager/lib/async-semaphore.js +44 -0
- package/template/claude-task-manager/lib/auth-context.js +5 -0
- package/template/claude-task-manager/lib/auth-rate-limit.js +47 -4
- package/template/claude-task-manager/lib/auth-rules.js +29 -2
- package/template/claude-task-manager/lib/auto-approval-verifier.js +129 -16
- package/template/claude-task-manager/lib/background-llm.js +144 -17
- package/template/claude-task-manager/lib/branch-inventory.js +212 -0
- package/template/claude-task-manager/lib/claude-desktop-sessions.js +15 -3
- package/template/claude-task-manager/lib/coalesce-sync-frames.js +151 -0
- package/template/claude-task-manager/lib/codex-launch-health.js +762 -0
- package/template/claude-task-manager/lib/codex-transcript-pager.js +51 -0
- package/template/claude-task-manager/lib/codex-zst.js +124 -0
- package/template/claude-task-manager/lib/coding-agent-models.js +233 -30
- package/template/claude-task-manager/lib/connection-health.js +232 -0
- package/template/claude-task-manager/lib/conversation-blob-parser.js +42 -0
- package/template/claude-task-manager/lib/conversation-tail-merge.js +89 -26
- package/template/claude-task-manager/lib/ctm-session-context-api.js +39 -10
- package/template/claude-task-manager/lib/cursor-conversation-store.js +354 -0
- package/template/claude-task-manager/lib/db-owner-worker-client.js +315 -0
- package/template/claude-task-manager/lib/document-review.js +141 -6
- package/template/claude-task-manager/lib/escalation-review.js +152 -0
- package/template/claude-task-manager/lib/graceful-shutdown.js +159 -0
- package/template/claude-task-manager/lib/headless-term-service.js +678 -0
- package/template/claude-task-manager/lib/heavy-worker-fallback.js +38 -0
- package/template/claude-task-manager/lib/jsonl-conversation-parser.js +542 -0
- package/template/claude-task-manager/lib/jsonl-range-reader.js +112 -0
- package/template/claude-task-manager/lib/main-db-census.js +216 -0
- package/template/claude-task-manager/lib/message-pagination.js +106 -4
- package/template/claude-task-manager/lib/microsoft-dev-tunnel-setup.js +750 -26
- package/template/claude-task-manager/lib/mobile-auth-api.js +274 -7
- package/template/claude-task-manager/lib/mobile-auth-store.js +592 -10
- package/template/claude-task-manager/lib/mobile-notification-dispatcher.js +15 -0
- package/template/claude-task-manager/lib/model-overview-brain-fallback.js +311 -0
- package/template/claude-task-manager/lib/model-overview-cache.js +141 -0
- package/template/claude-task-manager/lib/models-health-routing-notice.js +126 -0
- package/template/claude-task-manager/lib/node-pin-guard.js +93 -0
- package/template/claude-task-manager/lib/perf-tracker.js +242 -6
- package/template/claude-task-manager/lib/permission-match.js +76 -0
- package/template/claude-task-manager/lib/permission-sync.js +133 -20
- package/template/claude-task-manager/lib/process-title.js +35 -0
- package/template/claude-task-manager/lib/prompt-executions-query.js +25 -0
- package/template/claude-task-manager/lib/prompt-index-disk-cache.js +44 -0
- package/template/claude-task-manager/lib/prompt-intent.js +132 -0
- package/template/claude-task-manager/lib/provider-user-context.js +34 -0
- package/template/claude-task-manager/lib/read-pool-client.js +313 -0
- package/template/claude-task-manager/lib/readpool-breaker.js +31 -0
- package/template/claude-task-manager/lib/recent-sessions-breaker.js +12 -0
- package/template/claude-task-manager/lib/remote-feedback-client.js +72 -0
- package/template/claude-task-manager/lib/remote-relay-protocol.js +37 -4
- package/template/claude-task-manager/lib/remote-relay-store.js +159 -0
- package/template/claude-task-manager/lib/remote-submission-observer.js +278 -0
- package/template/claude-task-manager/lib/restart-guard.js +109 -0
- package/template/claude-task-manager/lib/restore-interruption-detector.js +439 -0
- package/template/claude-task-manager/lib/restore-policy.js +13 -0
- package/template/claude-task-manager/lib/restore-resume-batch.js +74 -0
- package/template/claude-task-manager/lib/restore-runtime.js +68 -0
- package/template/claude-task-manager/lib/restore-storm.js +34 -0
- package/template/claude-task-manager/lib/resume-cwd.js +36 -0
- package/template/claude-task-manager/lib/resume-preflight.js +313 -0
- package/template/claude-task-manager/lib/runtime-work-registry.js +444 -0
- package/template/claude-task-manager/lib/sanitize-openai-auth.js +31 -0
- package/template/claude-task-manager/lib/scheduler.js +21 -1
- package/template/claude-task-manager/lib/scrollback-snapshot-store.js +159 -0
- package/template/claude-task-manager/lib/serial-task-queue.js +64 -0
- package/template/claude-task-manager/lib/server-listeners.js +239 -0
- package/template/claude-task-manager/lib/session-capture.js +42 -7
- package/template/claude-task-manager/lib/session-content-backfill.js +131 -0
- package/template/claude-task-manager/lib/session-history.js +388 -43
- package/template/claude-task-manager/lib/session-host-manager.js +287 -0
- package/template/claude-task-manager/lib/session-image-refs.js +209 -0
- package/template/claude-task-manager/lib/session-jobs.js +399 -59
- package/template/claude-task-manager/lib/session-prompt-index.js +137 -0
- package/template/claude-task-manager/lib/session-restore.js +53 -0
- package/template/claude-task-manager/lib/session-standup.js +123 -23
- package/template/claude-task-manager/lib/session-state-bus.js +14 -0
- package/template/claude-task-manager/lib/session-stream.js +64 -16
- package/template/claude-task-manager/lib/session-timeline-summary.js +260 -0
- package/template/claude-task-manager/lib/session-token-usage.js +494 -0
- package/template/claude-task-manager/lib/session-workspace-binding.js +356 -0
- package/template/claude-task-manager/lib/setup-network-config.js +9 -0
- package/template/claude-task-manager/lib/size-cap.js +45 -0
- package/template/claude-task-manager/lib/size-cap.test.js +62 -0
- package/template/claude-task-manager/lib/skill-autocomplete.js +180 -1
- package/template/claude-task-manager/lib/skill-intent-resolver.js +304 -0
- package/template/claude-task-manager/lib/sqlite-driver.js +19 -3
- package/template/claude-task-manager/lib/standup-attention.js +7 -3
- package/template/claude-task-manager/lib/status-authority.js +39 -0
- package/template/claude-task-manager/lib/status-hooks.js +4 -0
- package/template/claude-task-manager/lib/storage-migration.js +235 -0
- package/template/claude-task-manager/lib/structured-capture.js +298 -0
- package/template/claude-task-manager/lib/sync-io-census.js +163 -0
- package/template/claude-task-manager/lib/tailscale-setup.js +6 -0
- package/template/claude-task-manager/lib/terminal-activity-evidence.js +33 -0
- package/template/claude-task-manager/lib/terminal-choice.js +364 -0
- package/template/claude-task-manager/lib/terminal-control-sanitize.js +17 -0
- package/template/claude-task-manager/lib/terminal-fingerprint.js +48 -0
- package/template/claude-task-manager/lib/terminal-output-flush.js +84 -0
- package/template/claude-task-manager/lib/timeline-order.js +122 -0
- package/template/claude-task-manager/lib/transcript-store.js +348 -43
- package/template/claude-task-manager/lib/transport-security.js +84 -1
- package/template/claude-task-manager/lib/wait-state.js +184 -0
- package/template/claude-task-manager/lib/walle-client.js +47 -5
- package/template/claude-task-manager/lib/walle-ctm-history.js +564 -4
- package/template/claude-task-manager/lib/walle-external-actions.js +135 -16
- package/template/claude-task-manager/lib/walle-history-hydration.js +46 -0
- package/template/claude-task-manager/lib/walle-native-health.js +403 -0
- package/template/claude-task-manager/lib/walle-repair.js +701 -0
- package/template/claude-task-manager/lib/walle-session-cache.js +109 -0
- package/template/claude-task-manager/lib/walle-session-context.js +57 -21
- package/template/claude-task-manager/lib/walle-session-model-catalog.js +34 -0
- package/template/claude-task-manager/lib/walle-supervisor.js +539 -63
- package/template/claude-task-manager/lib/walle-transcript.js +52 -0
- package/template/claude-task-manager/lib/worktree-active-sync.js +11 -7
- package/template/claude-task-manager/lib/worktree-cwd.js +32 -1
- package/template/claude-task-manager/package.json +1 -1
- package/template/claude-task-manager/prompt-harvest.js +89 -66
- package/template/claude-task-manager/providers/claude-code.js +51 -3
- package/template/claude-task-manager/providers/cursor.js +140 -45
- package/template/claude-task-manager/public/css/reviews.css +551 -61
- package/template/claude-task-manager/public/css/setup.css +191 -0
- package/template/claude-task-manager/public/css/walle-session.css +865 -10
- package/template/claude-task-manager/public/css/walle.css +154 -0
- package/template/claude-task-manager/public/designs/ai-providers-consolidation-v2.html +830 -0
- package/template/claude-task-manager/public/index.html +18516 -2058
- package/template/claude-task-manager/public/ipad.html +363 -0
- package/template/claude-task-manager/public/js/document-review-links.js +301 -0
- package/template/claude-task-manager/public/js/image-normalize.js +69 -36
- package/template/claude-task-manager/public/js/message-renderer.js +1265 -77
- package/template/claude-task-manager/public/js/prompts.js +66 -29
- package/template/claude-task-manager/public/js/reviews.js +901 -133
- package/template/claude-task-manager/public/js/session-activity-utils.js +11 -1
- package/template/claude-task-manager/public/js/session-search-utils.js +94 -10
- package/template/claude-task-manager/public/js/session-status-precedence.js +23 -5
- package/template/claude-task-manager/public/js/setup.js +1273 -176
- package/template/claude-task-manager/public/js/stream-view.js +691 -73
- package/template/claude-task-manager/public/js/terminal-reconciler.js +210 -0
- package/template/claude-task-manager/public/js/walle-session.js +2455 -158
- package/template/claude-task-manager/public/js/walle.js +455 -28
- package/template/claude-task-manager/public/m/app.css +2909 -262
- package/template/claude-task-manager/public/m/app.js +6601 -398
- package/template/claude-task-manager/public/m/claim.html +224 -17
- package/template/claude-task-manager/public/m/index.html +117 -21
- package/template/claude-task-manager/public/m/sw.js +3 -1
- package/template/claude-task-manager/public/manifest.json +2 -2
- package/template/claude-task-manager/public/prompts.html +30 -14
- package/template/claude-task-manager/queue-engine.js +507 -28
- package/template/claude-task-manager/scripts/repair-claude-session-images.js +27 -8
- package/template/claude-task-manager/server.js +14341 -2197
- package/template/claude-task-manager/session-integrity.js +160 -18
- package/template/claude-task-manager/session-search-ranking.js +1 -0
- package/template/claude-task-manager/session-utils.js +25 -5
- package/template/claude-task-manager/workers/approval-blocklist.js +96 -6
- package/template/claude-task-manager/workers/approval-widget-validator.js +14 -8
- package/template/claude-task-manager/workers/conversation-import-worker.js +11 -50
- package/template/claude-task-manager/workers/db-owner-worker.js +386 -0
- package/template/claude-task-manager/workers/harvest-worker.js +9 -55
- package/template/claude-task-manager/workers/headless-term-worker.js +9 -530
- package/template/claude-task-manager/workers/read-pool-worker.js +387 -0
- package/template/claude-task-manager/workers/scrollback-worker.js +11 -72
- package/template/claude-task-manager/workers/session-host-process.js +146 -0
- package/template/claude-task-manager/workers/session-integrity-worker.js +10 -54
- package/template/claude-task-manager/workers/state-detectors/base.js +18 -1
- package/template/claude-task-manager/workers/state-detectors/claude-code.js +182 -9
- package/template/claude-task-manager/workers/state-detectors/codex.js +150 -2
- package/template/claude-task-manager/workers/state-detectors/cursor.js +127 -0
- package/template/claude-task-manager/workers/state-detectors/gemini.js +21 -0
- package/template/claude-task-manager/workers/state-detectors/index.js +29 -0
- package/template/claude-task-manager/workers/state-detectors/opencode.js +103 -0
- package/template/docs/design/markdown-review-pane.md +206 -0
- package/template/docs/designs/2026-05-17-portkey-gateway-provider-ux.md +129 -38
- package/template/docs/designs/2026-05-20-mobile-worktree-finish-command.md +27 -0
- package/template/docs/designs/2026-05-22-ai-configuration-consolidation.md +248 -0
- package/template/docs/designs/ai-configuration-consolidation-mock.html +812 -0
- package/template/docs/private-memory-and-pii-policy.md +69 -0
- package/template/package.json +2 -1
- package/template/scripts/check-private-data.js +201 -0
- package/template/shared/sqlite-owner-guard.js +30 -0
- package/template/shared/sqlite-owner-write-queue.js +225 -0
- package/template/shared/sqlite-storage-policy.js +111 -0
- package/template/shared/sqlite-write-lock.js +428 -0
- package/template/wall-e/agent-runners/claude-code.js +5 -0
- package/template/wall-e/agent.js +166 -22
- package/template/wall-e/api-walle.js +524 -70
- package/template/wall-e/auth/provider-flows.js +11 -1
- package/template/wall-e/bin/walle-mcp-stdio.js +341 -17
- package/template/wall-e/brain.js +1614 -141
- package/template/wall-e/chat/attachment-blocks.js +96 -0
- package/template/wall-e/chat/attachments.js +2 -1
- package/template/wall-e/chat/capability-resolver.js +7 -7
- package/template/wall-e/chat/context-messages.js +28 -0
- package/template/wall-e/chat/conversation-frame.js +630 -0
- package/template/wall-e/chat/provider-messages.js +125 -0
- package/template/wall-e/chat.js +1002 -233
- package/template/wall-e/coding/acceptance-contract.js +170 -0
- package/template/wall-e/coding/acp-adapter.js +1 -1
- package/template/wall-e/coding/agent-catalog.js +3 -0
- package/template/wall-e/coding/artifact-store.js +93 -0
- package/template/wall-e/coding/capability-router.js +120 -0
- package/template/wall-e/coding/coding-run-controller.js +423 -0
- package/template/wall-e/coding/compaction-service.js +157 -12
- package/template/wall-e/coding/frontend-verification.js +258 -0
- package/template/wall-e/coding/lifecycle-hooks.js +75 -0
- package/template/wall-e/coding/local-preview-contract.js +157 -0
- package/template/wall-e/coding/permission-service.js +57 -13
- package/template/wall-e/coding/prompt-bundle.js +19 -1
- package/template/wall-e/coding/prompt-section-registry.js +227 -0
- package/template/wall-e/coding/provider-compat.js +15 -0
- package/template/wall-e/coding/runtime-events.js +224 -0
- package/template/wall-e/coding/runtime-mode.js +3 -0
- package/template/wall-e/coding/side-git-snapshot.js +160 -4
- package/template/wall-e/coding/snapshot-service.js +143 -1
- package/template/wall-e/coding/stream-processor.js +388 -34
- package/template/wall-e/coding/task-tool.js +141 -4
- package/template/wall-e/coding/tool-execution-controller.js +365 -0
- package/template/wall-e/coding/tool-registry.js +43 -5
- package/template/wall-e/coding/user-hooks.js +217 -0
- package/template/wall-e/coding-orchestrator.js +1330 -221
- package/template/wall-e/coding-prompts.js +20 -4
- package/template/wall-e/context/context-builder.js +15 -2
- package/template/wall-e/decision/confidence.js +1 -1
- package/template/wall-e/docs/coding-acceptance-contract.md +41 -0
- package/template/wall-e/docs/external-action-controller.md +26 -6
- package/template/wall-e/docs/telemetry-lifecycle.md +8 -2
- package/template/wall-e/embeddings.js +591 -53
- package/template/wall-e/external-action-controller.js +12 -0
- package/template/wall-e/http/auth.js +1 -0
- package/template/wall-e/http/chat-api.js +46 -11
- package/template/wall-e/http/model-admin.js +836 -34
- package/template/wall-e/lib/boot-profile.js +88 -0
- package/template/wall-e/lib/event-loop-monitor.js +93 -0
- package/template/wall-e/lib/service-health.js +194 -0
- package/template/wall-e/llm/anthropic.js +130 -5
- package/template/wall-e/llm/client.js +266 -63
- package/template/wall-e/llm/default-fallback.js +382 -0
- package/template/wall-e/llm/health.js +19 -0
- package/template/wall-e/llm/message-guard.js +78 -0
- package/template/wall-e/llm/model-catalog.js +252 -1
- package/template/wall-e/llm/openai.js +26 -4
- package/template/wall-e/llm/portkey-sync.js +654 -0
- package/template/wall-e/llm/provider-error.js +30 -2
- package/template/wall-e/llm/registry.js +5 -1
- package/template/wall-e/llm/request-compat.js +67 -0
- package/template/wall-e/loops/backfill.js +79 -23
- package/template/wall-e/loops/brain-optimize.js +67 -0
- package/template/wall-e/loops/ingest.js +25 -10
- package/template/wall-e/loops/question-digest.js +160 -0
- package/template/wall-e/loops/reflect.js +6 -4
- package/template/wall-e/loops/think.js +39 -12
- package/template/wall-e/mcp-server.js +318 -36
- package/template/wall-e/memory/ctm-context-client.js +52 -14
- package/template/wall-e/memory/ctm-operational-context.js +237 -0
- package/template/wall-e/memory/ctm-prompt-executions-client.js +128 -0
- package/template/wall-e/memory/ctm-session-context.js +111 -63
- package/template/wall-e/prompts/coding/deepseek.txt +3 -0
- package/template/wall-e/prompts/coding/gemini.txt +6 -0
- package/template/wall-e/prompts/coding/gpt.txt +6 -0
- package/template/wall-e/prompts/coding/local.txt +7 -0
- package/template/wall-e/runtime/decision-hooks.js +115 -0
- package/template/wall-e/runtime/devbox-gateway.js +82 -8
- package/template/wall-e/runtime/prompt-manifest.js +86 -0
- package/template/wall-e/runtime/tool-executor.js +269 -0
- package/template/wall-e/runtime/tool-result-envelope.js +138 -0
- package/template/wall-e/runtime/transcript-projection.js +60 -0
- package/template/wall-e/runtime/walle-runtime.js +224 -0
- package/template/wall-e/scripts/db-optimize/migrate.js +162 -0
- package/template/wall-e/scripts/db-optimize/recall-eval.js +117 -0
- package/template/wall-e/server.js +15 -0
- package/template/wall-e/session-files.js +9 -0
- package/template/wall-e/skills/_bundled/google-calendar/run.js +1 -1
- package/template/wall-e/skills/_bundled/gws-workspace/run.js +1 -1
- package/template/wall-e/skills/_bundled/slack-mentions/run.js +76 -6
- package/template/wall-e/skills/claude-code-reader.js +7 -3
- package/template/wall-e/skills/script-skill-runner.js +10 -0
- package/template/wall-e/skills/skill-planner.js +38 -0
- package/template/wall-e/tools/builtin-middleware.js +19 -9
- package/template/wall-e/tools/local-tools.js +1428 -16
- package/template/wall-e/tools/permission-checker.js +73 -5
- package/template/wall-e/tools/question-manager.js +117 -7
- package/template/wall-e/training/harvester.js +12 -28
- package/template/wall-e/training/replay.js +25 -80
- package/template/website/index.html +10 -10
- package/template/wall-e/eval/ab-test.js +0 -203
- package/template/wall-e/eval/agent-runner.js +0 -772
- package/template/wall-e/eval/agent-scorer.js +0 -461
- package/template/wall-e/eval/aggregator.js +0 -414
- package/template/wall-e/eval/allowed-test-commands.js +0 -34
- package/template/wall-e/eval/benchmark-generator.js +0 -113
- package/template/wall-e/eval/benchmarks/chat-eval.json +0 -1662
- package/template/wall-e/eval/benchmarks/chat.json +0 -82
- package/template/wall-e/eval/benchmarks/coding-agent-real.json +0 -1
- package/template/wall-e/eval/benchmarks/coding-agent.json +0 -1581
- package/template/wall-e/eval/benchmarks/coding.json +0 -122
- package/template/wall-e/eval/benchmarks/memory-retrieval.json +0 -234
- package/template/wall-e/eval/benchmarks/reasoning.json +0 -82
- package/template/wall-e/eval/benchmarks/swebench-lite-30.json +0 -212
- package/template/wall-e/eval/benchmarks.js +0 -669
- package/template/wall-e/eval/cc-replay.js +0 -719
- package/template/wall-e/eval/chat-eval.js +0 -525
- package/template/wall-e/eval/check-keys.js +0 -15
- package/template/wall-e/eval/check-providers.js +0 -42
- package/template/wall-e/eval/codex-cli-baseline.js +0 -669
- package/template/wall-e/eval/coding-agent-real.js +0 -570
- package/template/wall-e/eval/context-compactor.js +0 -251
- package/template/wall-e/eval/debug-agent003.js +0 -68
- package/template/wall-e/eval/diagnostics.js +0 -216
- package/template/wall-e/eval/eval-orchestrator.js +0 -642
- package/template/wall-e/eval/evaluate.js +0 -202
- package/template/wall-e/eval/evaluator.js +0 -373
- package/template/wall-e/eval/exporter.js +0 -212
- package/template/wall-e/eval/fixtures/express-basic/package.json +0 -9
- package/template/wall-e/eval/fixtures/express-basic/server.js +0 -115
- package/template/wall-e/eval/fixtures/express-basic/test.js +0 -83
- package/template/wall-e/eval/fixtures/express-buggy/package.json +0 -9
- package/template/wall-e/eval/fixtures/express-buggy/server.js +0 -113
- package/template/wall-e/eval/fixtures/express-buggy/test.js +0 -83
- package/template/wall-e/eval/fixtures/express-buggy-items/package.json +0 -9
- package/template/wall-e/eval/fixtures/express-buggy-items/server.js +0 -112
- package/template/wall-e/eval/fixtures/express-buggy-items/test.js +0 -83
- package/template/wall-e/eval/fixtures/express-buggy-search/package.json +0 -9
- package/template/wall-e/eval/fixtures/express-buggy-search/server.js +0 -121
- package/template/wall-e/eval/fixtures/express-buggy-search/test.js +0 -83
- package/template/wall-e/eval/fixtures/express-rename-data/data.js +0 -34
- package/template/wall-e/eval/fixtures/express-rename-data/package.json +0 -9
- package/template/wall-e/eval/fixtures/express-rename-data/server.js +0 -97
- package/template/wall-e/eval/fixtures/express-rename-data/test.js +0 -88
- package/template/wall-e/eval/fixtures/express-xss/package.json +0 -12
- package/template/wall-e/eval/fixtures/express-xss/server.js +0 -90
- package/template/wall-e/eval/fixtures/express-xss/test.js +0 -67
- package/template/wall-e/eval/fixtures/express-xss/views/profile.ejs +0 -9
- package/template/wall-e/eval/fixtures/fullstack-app/config/default.js +0 -9
- package/template/wall-e/eval/fixtures/fullstack-app/config/test.js +0 -13
- package/template/wall-e/eval/fixtures/fullstack-app/package.json +0 -11
- package/template/wall-e/eval/fixtures/fullstack-app/public/css/style.css +0 -137
- package/template/wall-e/eval/fixtures/fullstack-app/public/index.html +0 -46
- package/template/wall-e/eval/fixtures/fullstack-app/public/js/app.js +0 -121
- package/template/wall-e/eval/fixtures/fullstack-app/public/js/auth.js +0 -71
- package/template/wall-e/eval/fixtures/fullstack-app/public/js/items.js +0 -80
- package/template/wall-e/eval/fixtures/fullstack-app/public/js/users.js +0 -46
- package/template/wall-e/eval/fixtures/fullstack-app/public/login.html +0 -45
- package/template/wall-e/eval/fixtures/fullstack-app/public/register.html +0 -38
- package/template/wall-e/eval/fixtures/fullstack-app/scripts/migrate.js +0 -23
- package/template/wall-e/eval/fixtures/fullstack-app/scripts/seed.js +0 -46
- package/template/wall-e/eval/fixtures/fullstack-app/server/db.js +0 -99
- package/template/wall-e/eval/fixtures/fullstack-app/server/index.js +0 -94
- package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/auth.js +0 -19
- package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/logger.js +0 -19
- package/template/wall-e/eval/fixtures/fullstack-app/server/router.js +0 -50
- package/template/wall-e/eval/fixtures/fullstack-app/server/routes/auth.js +0 -69
- package/template/wall-e/eval/fixtures/fullstack-app/server/routes/health.js +0 -23
- package/template/wall-e/eval/fixtures/fullstack-app/server/routes/items.js +0 -88
- package/template/wall-e/eval/fixtures/fullstack-app/server/routes/users.js +0 -75
- package/template/wall-e/eval/fixtures/fullstack-app/server/test.js +0 -198
- package/template/wall-e/eval/fixtures/fullstack-app/server/utils/response.js +0 -34
- package/template/wall-e/eval/fixtures/fullstack-app/server/utils/validate.js +0 -26
- package/template/wall-e/eval/fixtures/fullstack-app/server.js +0 -8
- package/template/wall-e/eval/fixtures/fullstack-app/test.js +0 -12
- package/template/wall-e/eval/fixtures/monorepo-basic/package.json +0 -8
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/data.js +0 -58
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/middleware.js +0 -46
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/package.json +0 -8
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/routes.js +0 -64
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/server.js +0 -56
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/test.js +0 -116
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/commands.js +0 -61
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/index.js +0 -62
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/output.js +0 -43
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/package.json +0 -11
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/test.js +0 -44
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/formatters.js +0 -43
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/index.js +0 -12
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/package.json +0 -5
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/test.js +0 -55
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/validators.js +0 -29
- package/template/wall-e/eval/fixtures/monorepo-basic/test.js +0 -46
- package/template/wall-e/eval/fixtures/node-cli/index.js +0 -78
- package/template/wall-e/eval/fixtures/node-cli/package.json +0 -10
- package/template/wall-e/eval/fixtures/node-cli/test.js +0 -57
- package/template/wall-e/eval/fixtures/node-typed/package.json +0 -8
- package/template/wall-e/eval/fixtures/node-typed/src/handlers.js +0 -31
- package/template/wall-e/eval/fixtures/node-typed/src/utils.js +0 -33
- package/template/wall-e/eval/fixtures/node-typed/test.js +0 -36
- package/template/wall-e/eval/fixtures/python-flask/app.py +0 -14
- package/template/wall-e/eval/fixtures/python-flask/requirements.txt +0 -2
- package/template/wall-e/eval/fixtures/python-flask/test_app.py +0 -25
- package/template/wall-e/eval/fixtures/wall-e-subset/brain.js +0 -105
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/aggregator.js +0 -101
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/chat.json +0 -20
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/coding.json +0 -32
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks.js +0 -64
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/package.json +0 -6
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/server.js +0 -31
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/test.js +0 -18
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/utils.js +0 -34
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/runner.js +0 -104
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/scorer.js +0 -73
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/test.js +0 -134
- package/template/wall-e/eval/fixtures/wall-e-subset/llm/client.js +0 -99
- package/template/wall-e/eval/fixtures/wall-e-subset/llm/providers.js +0 -63
- package/template/wall-e/eval/fixtures/wall-e-subset/llm/test.js +0 -70
- package/template/wall-e/eval/fixtures/wall-e-subset/package.json +0 -10
- package/template/wall-e/eval/fixtures/wall-e-subset/test.js +0 -86
- package/template/wall-e/eval/harvester.js +0 -685
- package/template/wall-e/eval/head-to-head.js +0 -388
- package/template/wall-e/eval/humaneval-adapter.js +0 -321
- package/template/wall-e/eval/list-models.js +0 -31
- package/template/wall-e/eval/livecodebench-adapter.js +0 -291
- package/template/wall-e/eval/mail-integration.js +0 -443
- package/template/wall-e/eval/manifest.js +0 -186
- package/template/wall-e/eval/meta-harness/adapters/coding-agent.js +0 -57
- package/template/wall-e/eval/meta-harness/bootstrap-snapshot.js +0 -149
- package/template/wall-e/eval/meta-harness/candidate-store.js +0 -117
- package/template/wall-e/eval/meta-harness/cli.js +0 -86
- package/template/wall-e/eval/meta-harness/domain-spec.js +0 -154
- package/template/wall-e/eval/meta-harness/domains/coding-agent.domain.json +0 -84
- package/template/wall-e/eval/meta-harness/examples/env-bootstrap-candidate.js +0 -29
- package/template/wall-e/eval/meta-harness/experience-store.js +0 -174
- package/template/wall-e/eval/meta-harness/frontier.js +0 -96
- package/template/wall-e/eval/meta-harness/harness-interface.js +0 -90
- package/template/wall-e/eval/meta-harness/leakage-guard.js +0 -80
- package/template/wall-e/eval/meta-harness/optimizer.js +0 -207
- package/template/wall-e/eval/meta-harness/proposer-runner.js +0 -110
- package/template/wall-e/eval/meta-harness/reporting.js +0 -58
- package/template/wall-e/eval/meta-harness/telemetry.js +0 -27
- package/template/wall-e/eval/meta-harness/validation.js +0 -81
- package/template/wall-e/eval/promoter.js +0 -228
- package/template/wall-e/eval/provider-normalizer.js +0 -33
- package/template/wall-e/eval/replay.js +0 -395
- package/template/wall-e/eval/run-agent-benchmarks.js +0 -386
- package/template/wall-e/eval/run-codex-cli-baseline.js +0 -177
- package/template/wall-e/eval/run-coding-agent-real.js +0 -187
- package/template/wall-e/eval/run-eval.js +0 -435
- package/template/wall-e/eval/run-model-comparison.js +0 -142
- package/template/wall-e/eval/session-evaluator.js +0 -187
- package/template/wall-e/eval/session-miner.js +0 -207
- package/template/wall-e/eval/session-retrieval-benchmark.js +0 -150
- package/template/wall-e/eval/session-transcripts.js +0 -509
- package/template/wall-e/eval/shadow.js +0 -161
- package/template/wall-e/eval/swebench-adapter.js +0 -345
- package/template/wall-e/eval/swebench-docker.js +0 -192
- package/template/wall-e/eval/train.py +0 -320
- package/template/wall-e/eval/trainer.js +0 -232
- package/template/wall-e/eval/weekly-eval-loop.js +0 -241
|
@@ -1,443 +0,0 @@
|
|
|
1
|
-
'use strict';
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* mail-integration.js — Integration tests for mail tools.
|
|
5
|
-
*
|
|
6
|
-
* Unlike chat-eval.js (which mocks tool results), these tests call the REAL
|
|
7
|
-
* searchMail(), getMailMessages(), and readMailMessage() functions against
|
|
8
|
-
* the live macOS Mail app. They catch the class of bugs that mocks cannot:
|
|
9
|
-
* - AppleScript/JXA timeouts on large mailboxes
|
|
10
|
-
* - INBOX case sensitivity
|
|
11
|
-
* - Sent folder name mismatches across accounts (Gmail vs iCloud vs Exchange)
|
|
12
|
-
* - TCC/FDA permission failures
|
|
13
|
-
* - Delimiter collisions in email content
|
|
14
|
-
* - Envelope Index (Swift fast path) compilation & access
|
|
15
|
-
*
|
|
16
|
-
* Usage:
|
|
17
|
-
* node eval/mail-integration.js # run all tests
|
|
18
|
-
* node eval/mail-integration.js --filter timeout # run matching tests
|
|
19
|
-
* node eval/mail-integration.js --dry-run # show tests without running
|
|
20
|
-
*/
|
|
21
|
-
|
|
22
|
-
const path = require('path');
|
|
23
|
-
const { execFileAsync } = (() => {
|
|
24
|
-
const { execFile } = require('child_process');
|
|
25
|
-
const { promisify } = require('util');
|
|
26
|
-
return { execFileAsync: promisify(execFile) };
|
|
27
|
-
})();
|
|
28
|
-
|
|
29
|
-
// Load real tool functions
|
|
30
|
-
const {
|
|
31
|
-
getMailMessages, readMailMessage, searchMail,
|
|
32
|
-
} = require('../tools/local-tools');
|
|
33
|
-
|
|
34
|
-
const TIMEOUT_MS = 120000; // per-test timeout (Mail.app scripting bridge takes ~24-47s on large mailboxes, up to 120s without FDA)
|
|
35
|
-
|
|
36
|
-
// ============================================================
|
|
37
|
-
// Test helpers
|
|
38
|
-
// ============================================================
|
|
39
|
-
|
|
40
|
-
/**
 * Runs `fn` and rejects if it has not settled within `ms` milliseconds.
 *
 * The timeout timer is always cleared once the race settles, so a test that
 * finishes quickly does not leave a pending timer keeping the process alive.
 *
 * @param {() => Promise<*>|*} fn - Work to run; its result (or rejection) is propagated.
 * @param {number} ms - Maximum time to wait, in milliseconds.
 * @returns {Promise<*>} Resolves with the value produced by `fn`.
 * @throws {Error} `Timed out after <ms>ms` when the deadline passes first.
 */
async function runWithTimeout(fn, ms) {
  let timer;
  try {
    const work = fn();
    const deadline = new Promise((_, reject) => {
      timer = setTimeout(() => reject(new Error(`Timed out after ${ms}ms`)), ms);
    });
    return await Promise.race([work, deadline]);
  } finally {
    // clearTimeout(undefined) is a no-op, so this is safe if fn() threw synchronously.
    clearTimeout(timer);
  }
}
|
|
46
|
-
|
|
47
|
-
/**
 * Minimal assertion helper for the test cases below.
 *
 * @param {*} condition - Value expected to be truthy.
 * @param {string} msg - Description appended to the thrown error.
 * @throws {Error} `Assertion failed: <msg>` when `condition` is falsy.
 */
function assert(condition, msg) {
  if (condition) return;
  throw new Error(`Assertion failed: ${msg}`);
}
|
|
50
|
-
|
|
51
|
-
// ============================================================
|
|
52
|
-
// Test cases — each returns { passed, detail } or throws
|
|
53
|
-
// ============================================================
|
|
54
|
-
|
|
55
|
-
const TESTS = [
|
|
56
|
-
// --- Basic functionality ---
|
|
57
|
-
{
|
|
58
|
-
id: 'mail-int-01',
|
|
59
|
-
name: 'getMailMessages returns structured JSON with count and messages array',
|
|
60
|
-
tags: ['basic', 'mail_messages'],
|
|
61
|
-
async run() {
|
|
62
|
-
const result = await runWithTimeout(() => getMailMessages({ hours_back: 72, limit: 5 }), TIMEOUT_MS);
|
|
63
|
-
assert(typeof result === 'object', 'result should be an object');
|
|
64
|
-
assert('count' in result || 'error' in result, 'result should have count or error');
|
|
65
|
-
if (result.error) {
|
|
66
|
-
// Acceptable errors: Mail not running, automation denied, scripting bridge timeout
|
|
67
|
-
assert(
|
|
68
|
-
/automation|not running|timed out|timeout|Full Disk|Command failed/i.test(result.error),
|
|
69
|
-
`Unexpected error: ${result.error}`
|
|
70
|
-
);
|
|
71
|
-
return { passed: true, detail: `Expected error: ${result.error.slice(0, 100)}` };
|
|
72
|
-
}
|
|
73
|
-
assert(typeof result.count === 'number', 'count should be a number');
|
|
74
|
-
assert(Array.isArray(result.messages), 'messages should be an array');
|
|
75
|
-
assert(result.count === result.messages.length, `count (${result.count}) should match messages.length (${result.messages.length})`);
|
|
76
|
-
if (result.messages.length > 0) {
|
|
77
|
-
const msg = result.messages[0];
|
|
78
|
-
assert('subject' in msg, 'message should have subject');
|
|
79
|
-
assert('from' in msg, 'message should have from');
|
|
80
|
-
assert('date' in msg, 'message should have date');
|
|
81
|
-
assert('read' in msg, 'message should have read status');
|
|
82
|
-
assert('messageId' in msg, 'message should have messageId');
|
|
83
|
-
}
|
|
84
|
-
const extra = result.slow_accounts ? ` (slow: ${result.slow_accounts.join(', ')})` : '';
|
|
85
|
-
return { passed: true, detail: `Got ${result.count} messages${extra}` };
|
|
86
|
-
},
|
|
87
|
-
},
|
|
88
|
-
|
|
89
|
-
{
|
|
90
|
-
id: 'mail-int-02',
|
|
91
|
-
name: 'getMailMessages completes within 90 seconds (performance regression)',
|
|
92
|
-
tags: ['performance', 'mail_messages', 'timeout'],
|
|
93
|
-
async run() {
|
|
94
|
-
const start = Date.now();
|
|
95
|
-
const result = await runWithTimeout(() => getMailMessages({ hours_back: 24, limit: 10 }), 90000);
|
|
96
|
-
const elapsed = Date.now() - start;
|
|
97
|
-
if (result.error && /automation|not running|timed out|timeout|Full Disk/i.test(result.error)) {
|
|
98
|
-
return { passed: true, detail: `Skipped (expected): ${result.error.slice(0, 80)}` };
|
|
99
|
-
}
|
|
100
|
-
assert(elapsed < 90000, `Took ${elapsed}ms — should be under 90s.`);
|
|
101
|
-
const extra = result.slow_accounts ? ` (slow: ${result.slow_accounts.join(', ')})` : '';
|
|
102
|
-
return { passed: true, detail: `${elapsed}ms for ${result.count} messages${extra}` };
|
|
103
|
-
},
|
|
104
|
-
},
|
|
105
|
-
|
|
106
|
-
{
|
|
107
|
-
id: 'mail-int-03',
|
|
108
|
-
name: 'getMailMessages with INBOX (uppercase) returns results',
|
|
109
|
-
tags: ['basic', 'mail_messages', 'inbox-case'],
|
|
110
|
-
async run() {
|
|
111
|
-
const result = await runWithTimeout(() => getMailMessages({ hours_back: 24, mailbox: 'INBOX', limit: 3 }), TIMEOUT_MS);
|
|
112
|
-
if (result.error && /automation|not running|timed out|timeout|Full Disk/i.test(result.error)) {
|
|
113
|
-
return { passed: true, detail: `Skipped: ${result.error.slice(0, 80)}` };
|
|
114
|
-
}
|
|
115
|
-
// INBOX (uppercase) should work on all accounts (may have 0 results if all accounts were slow)
|
|
116
|
-
assert(!result.error, `INBOX (uppercase) failed: ${result.error}`);
|
|
117
|
-
const extra = result.slow_accounts ? ` (slow: ${result.slow_accounts.join(', ')})` : '';
|
|
118
|
-
return { passed: true, detail: `INBOX returned ${result.count} messages${extra}` };
|
|
119
|
-
},
|
|
120
|
-
},
|
|
121
|
-
|
|
122
|
-
{
|
|
123
|
-
id: 'mail-int-04',
|
|
124
|
-
name: 'getMailMessages with lowercase inbox should also work (case sensitivity bug)',
|
|
125
|
-
tags: ['regression', 'mail_messages', 'inbox-case'],
|
|
126
|
-
async run() {
|
|
127
|
-
// This was the original bug: lowercase "inbox" silently returned 0 results on some accounts
|
|
128
|
-
const upper = await runWithTimeout(() => getMailMessages({ hours_back: 24, mailbox: 'INBOX', limit: 5 }), TIMEOUT_MS);
|
|
129
|
-
if (upper.error && /automation|not running|timed out|timeout|Full Disk/i.test(upper.error)) {
|
|
130
|
-
return { passed: true, detail: `Skipped: ${upper.error.slice(0, 80)}` };
|
|
131
|
-
}
|
|
132
|
-
// The tool now defaults to 'INBOX' — verify the default is uppercase
|
|
133
|
-
const defaultResult = await runWithTimeout(() => getMailMessages({ hours_back: 24, limit: 5 }), TIMEOUT_MS);
|
|
134
|
-
if (defaultResult.error) {
|
|
135
|
-
return { passed: true, detail: `Skipped: ${defaultResult.error.slice(0, 80)}` };
|
|
136
|
-
}
|
|
137
|
-
// Default should return same or similar count as explicit INBOX
|
|
138
|
-
assert(
|
|
139
|
-
Math.abs(upper.count - defaultResult.count) <= 2,
|
|
140
|
-
`Default mailbox (${defaultResult.count}) vs INBOX (${upper.count}) differ significantly — possible case sensitivity issue`
|
|
141
|
-
);
|
|
142
|
-
return { passed: true, detail: `Default=${defaultResult.count}, INBOX=${upper.count}` };
|
|
143
|
-
},
|
|
144
|
-
},
|
|
145
|
-
|
|
146
|
-
// --- searchMail ---
|
|
147
|
-
{
|
|
148
|
-
id: 'mail-int-05',
|
|
149
|
-
name: 'searchMail returns structured JSON with count and messages array',
|
|
150
|
-
tags: ['basic', 'mail_search'],
|
|
151
|
-
async run() {
|
|
152
|
-
// Search for something common — "re:" exists in most inboxes
|
|
153
|
-
const result = await runWithTimeout(() => searchMail({ query: 're:', days_back: 7, limit: 3 }), TIMEOUT_MS);
|
|
154
|
-
assert(typeof result === 'object', 'result should be an object');
|
|
155
|
-
assert('count' in result || 'error' in result, 'result should have count or error');
|
|
156
|
-
if (result.error) {
|
|
157
|
-
assert(
|
|
158
|
-
/automation|not running|timed out|timeout|Full Disk|Command failed/i.test(result.error),
|
|
159
|
-
`Unexpected error: ${result.error}`
|
|
160
|
-
);
|
|
161
|
-
return { passed: true, detail: `Expected error: ${result.error.slice(0, 100)}` };
|
|
162
|
-
}
|
|
163
|
-
assert(typeof result.count === 'number', 'count should be a number');
|
|
164
|
-
assert(Array.isArray(result.messages), 'messages should be an array');
|
|
165
|
-
if (result.messages.length > 0) {
|
|
166
|
-
const msg = result.messages[0];
|
|
167
|
-
assert('subject' in msg, 'message should have subject');
|
|
168
|
-
assert('from' in msg || 'sender' in msg, 'message should have from/sender');
|
|
169
|
-
assert('date' in msg, 'message should have date');
|
|
170
|
-
}
|
|
171
|
-
return { passed: true, detail: `Got ${result.count} results` };
|
|
172
|
-
},
|
|
173
|
-
},
|
|
174
|
-
|
|
175
|
-
{
|
|
176
|
-
id: 'mail-int-06',
|
|
177
|
-
name: 'searchMail completes within 45 seconds with days_back=30 (performance regression)',
|
|
178
|
-
tags: ['performance', 'mail_search', 'timeout'],
|
|
179
|
-
async run() {
|
|
180
|
-
const start = Date.now();
|
|
181
|
-
const result = await runWithTimeout(() => searchMail({ query: 'test', days_back: 30, limit: 5 }), 45000);
|
|
182
|
-
const elapsed = Date.now() - start;
|
|
183
|
-
if (result.error && /automation|not running|timed out|timeout|Full Disk/i.test(result.error)) {
|
|
184
|
-
return { passed: true, detail: `Skipped: ${result.error.slice(0, 60)}` };
|
|
185
|
-
}
|
|
186
|
-
assert(elapsed < 45000, `Took ${elapsed}ms — should be under 45s. The JXA whose clause is O(n) on large mailboxes.`);
|
|
187
|
-
return { passed: true, detail: `${elapsed}ms for ${result.count} results` };
|
|
188
|
-
},
|
|
189
|
-
},
|
|
190
|
-
|
|
191
|
-
{
|
|
192
|
-
id: 'mail-int-07',
|
|
193
|
-
name: 'searchMail with empty query and no sender returns error',
|
|
194
|
-
tags: ['validation', 'mail_search'],
|
|
195
|
-
async run() {
|
|
196
|
-
const result = await searchMail({});
|
|
197
|
-
assert(result.error, 'Empty search should return an error');
|
|
198
|
-
assert(result.count === 0, 'Empty search should return count 0');
|
|
199
|
-
return { passed: true, detail: `Error: ${result.error}` };
|
|
200
|
-
},
|
|
201
|
-
},
|
|
202
|
-
|
|
203
|
-
{
|
|
204
|
-
id: 'mail-int-08',
|
|
205
|
-
name: 'searchMail with include_content returns email body',
|
|
206
|
-
tags: ['basic', 'mail_search', 'include-content'],
|
|
207
|
-
async run() {
|
|
208
|
-
const result = await runWithTimeout(() => searchMail({ query: 're:', days_back: 7, limit: 1, include_content: true }), TIMEOUT_MS);
|
|
209
|
-
if (result.error && /automation|not running|timed out|timeout|Full Disk/i.test(result.error)) {
|
|
210
|
-
return { passed: true, detail: `Skipped: ${result.error.slice(0, 60)}` };
|
|
211
|
-
}
|
|
212
|
-
if (result.count === 0) {
|
|
213
|
-
return { passed: true, detail: 'No messages found to verify content' };
|
|
214
|
-
}
|
|
215
|
-
const msg = result.messages[0];
|
|
216
|
-
assert('content' in msg, 'Message should have content when include_content=true');
|
|
217
|
-
assert(typeof msg.content === 'string', 'Content should be a string');
|
|
218
|
-
return { passed: true, detail: `Got content (${msg.content.length} chars) for "${msg.subject?.slice(0, 40)}"` };
|
|
219
|
-
},
|
|
220
|
-
},
|
|
221
|
-
|
|
222
|
-
{
|
|
223
|
-
id: 'mail-int-09',
|
|
224
|
-
name: 'searchMail searches sent folder (cross-provider folder name bug)',
|
|
225
|
-
tags: ['regression', 'mail_search', 'sent-folder'],
|
|
226
|
-
async run() {
|
|
227
|
-
// This was the bug: "sent messages" only works for iCloud, not Gmail/Exchange
|
|
228
|
-
const result = await runWithTimeout(() => searchMail({ query: 're:', days_back: 30, limit: 5 }), TIMEOUT_MS);
|
|
229
|
-
if (result.error && /automation|not running|timed out|timeout|Full Disk/i.test(result.error)) {
|
|
230
|
-
return { passed: true, detail: `Skipped: ${result.error.slice(0, 60)}` };
|
|
231
|
-
}
|
|
232
|
-
// Check if any results come from a sent folder
|
|
233
|
-
const sentMsgs = result.messages.filter(m =>
|
|
234
|
-
/sent/i.test(m.mailbox || '') || /sent/i.test(m.folder || '')
|
|
235
|
-
);
|
|
236
|
-
// We can't guarantee sent results, but verify the structure is correct
|
|
237
|
-
return { passed: true, detail: `${result.count} results, ${sentMsgs.length} from sent folders` };
|
|
238
|
-
},
|
|
239
|
-
},
|
|
240
|
-
|
|
241
|
-
{
|
|
242
|
-
id: 'mail-int-10',
|
|
243
|
-
name: 'searchMail does not produce delimiter collisions in results',
|
|
244
|
-
tags: ['regression', 'mail_search', 'delimiter'],
|
|
245
|
-
async run() {
|
|
246
|
-
// This was the bug: emails containing " || " in content corrupted parsing
|
|
247
|
-
// The fix was to switch from AppleScript+delimiter to JXA+JSON
|
|
248
|
-
const result = await runWithTimeout(() => searchMail({ query: 're:', days_back: 7, limit: 10, include_content: true }), TIMEOUT_MS);
|
|
249
|
-
if (result.error && /automation|not running|timed out|timeout|Full Disk/i.test(result.error)) {
|
|
250
|
-
return { passed: true, detail: `Skipped: ${result.error.slice(0, 60)}` };
|
|
251
|
-
}
|
|
252
|
-
// Verify count matches actual array length (delimiter collision would inflate count)
|
|
253
|
-
assert(
|
|
254
|
-
result.count === result.messages.length,
|
|
255
|
-
`count (${result.count}) !== messages.length (${result.messages.length}) — possible delimiter collision`
|
|
256
|
-
);
|
|
257
|
-
// Verify each message has well-formed fields
|
|
258
|
-
for (const msg of result.messages) {
|
|
259
|
-
assert(typeof msg.subject === 'string', `Malformed subject: ${JSON.stringify(msg.subject)}`);
|
|
260
|
-
assert(typeof msg.date === 'string', `Malformed date: ${JSON.stringify(msg.date)}`);
|
|
261
|
-
}
|
|
262
|
-
return { passed: true, detail: `${result.count} results, all well-formed` };
|
|
263
|
-
},
|
|
264
|
-
},
|
|
265
|
-
|
|
266
|
-
// --- Swift fast path ---
|
|
267
|
-
{
|
|
268
|
-
id: 'mail-int-11',
|
|
269
|
-
name: 'Swift Envelope Index binary compiles or is cached',
|
|
270
|
-
tags: ['fast-path', 'swift'],
|
|
271
|
-
async run() {
|
|
272
|
-
const binPath = path.join(__dirname, '..', 'skills', '_bundled', 'email-sync', '.mail-search');
|
|
273
|
-
const srcPath = path.join(__dirname, '..', 'skills', '_bundled', 'email-sync', 'mail-search.swift');
|
|
274
|
-
const fs = require('fs');
|
|
275
|
-
assert(fs.existsSync(srcPath), `Swift source not found: ${srcPath}`);
|
|
276
|
-
// Try to compile if binary doesn't exist
|
|
277
|
-
try {
|
|
278
|
-
await execFileAsync('swiftc', ['-O', srcPath, '-o', binPath], { timeout: 30000 });
|
|
279
|
-
assert(fs.existsSync(binPath), 'Binary should exist after compilation');
|
|
280
|
-
return { passed: true, detail: 'Compiled successfully' };
|
|
281
|
-
} catch (err) {
|
|
282
|
-
if (/not found|ENOENT/.test(err.message)) {
|
|
283
|
-
return { passed: true, detail: 'Swift compiler not available — fast path skipped at runtime' };
|
|
284
|
-
}
|
|
285
|
-
return { passed: false, detail: `Compilation failed: ${err.message.slice(0, 100)}` };
|
|
286
|
-
}
|
|
287
|
-
},
|
|
288
|
-
},
|
|
289
|
-
|
|
290
|
-
{
|
|
291
|
-
id: 'mail-int-12',
|
|
292
|
-
name: 'Swift fast path returns results or gracefully falls back',
|
|
293
|
-
tags: ['fast-path', 'swift', 'tcc'],
|
|
294
|
-
async run() {
|
|
295
|
-
// The fast path requires Full Disk Access (FDA). If not granted, it should
|
|
296
|
-
// return null (not crash), and searchMail falls back to JXA.
|
|
297
|
-
const result = await runWithTimeout(() => searchMail({ query: 're:', days_back: 7, limit: 3 }), TIMEOUT_MS);
|
|
298
|
-
if (result.error && /automation|not running|timed out|timeout|Full Disk/i.test(result.error)) {
|
|
299
|
-
return { passed: true, detail: `Skipped: ${result.error.slice(0, 60)}` };
|
|
300
|
-
}
|
|
301
|
-
// The key assertion: searchMail should ALWAYS return valid JSON, regardless
|
|
302
|
-
// of whether the fast path worked or fell back
|
|
303
|
-
assert(typeof result === 'object', 'Should return object');
|
|
304
|
-
assert('count' in result, 'Should have count');
|
|
305
|
-
assert(Array.isArray(result.messages), 'Should have messages array');
|
|
306
|
-
const method = result.search_method || 'unknown';
|
|
307
|
-
return { passed: true, detail: `${result.count} results via ${method}` };
|
|
308
|
-
},
|
|
309
|
-
},
|
|
310
|
-
|
|
311
|
-
// --- Sender search ---
|
|
312
|
-
{
|
|
313
|
-
id: 'mail-int-13',
|
|
314
|
-
name: 'searchMail by sender only (no subject query)',
|
|
315
|
-
tags: ['basic', 'mail_search', 'sender'],
|
|
316
|
-
async run() {
|
|
317
|
-
// Search by sender alone — should work without a subject query
|
|
318
|
-
const result = await runWithTimeout(() => searchMail({ sender: '@', days_back: 7, limit: 3 }), TIMEOUT_MS);
|
|
319
|
-
if (result.error) {
|
|
320
|
-
if (/automation|not running|timed out|timeout|Full Disk|Command failed/i.test(result.error)) {
|
|
321
|
-
return { passed: true, detail: `Skipped: ${result.error.slice(0, 80)}` };
|
|
322
|
-
}
|
|
323
|
-
assert(false, `Sender-only search failed: ${result.error}`);
|
|
324
|
-
}
|
|
325
|
-
return { passed: true, detail: `${result.count} results for sender '@'` };
|
|
326
|
-
},
|
|
327
|
-
},
|
|
328
|
-
|
|
329
|
-
// --- Error handling ---
|
|
330
|
-
{
|
|
331
|
-
id: 'mail-int-14',
|
|
332
|
-
name: 'searchMail with SQL injection-like query does not crash',
|
|
333
|
-
tags: ['security', 'mail_search'],
|
|
334
|
-
async run() {
|
|
335
|
-
const result = await runWithTimeout(
|
|
336
|
-
() => searchMail({ query: '\'; DROP TABLE messages; --', days_back: 7, limit: 1 }),
|
|
337
|
-
TIMEOUT_MS
|
|
338
|
-
);
|
|
339
|
-
// Should return 0 results or an error, never crash
|
|
340
|
-
assert(typeof result === 'object', 'Should return object even with malicious input');
|
|
341
|
-
return { passed: true, detail: result.error ? `Error (expected): ${result.error.slice(0, 60)}` : `${result.count} results (safely handled)` };
|
|
342
|
-
},
|
|
343
|
-
},
|
|
344
|
-
|
|
345
|
-
{
|
|
346
|
-
id: 'mail-int-15',
|
|
347
|
-
name: 'searchMail with very large days_back still completes',
|
|
348
|
-
tags: ['performance', 'mail_search'],
|
|
349
|
-
async run() {
|
|
350
|
-
const start = Date.now();
|
|
351
|
-
const result = await runWithTimeout(() => searchMail({ query: 'xyznonexistent99', days_back: 365, limit: 1 }), TIMEOUT_MS);
|
|
352
|
-
const elapsed = Date.now() - start;
|
|
353
|
-
if (result.error && /automation|not running|timed out|timeout|Full Disk/i.test(result.error)) {
|
|
354
|
-
return { passed: true, detail: `Skipped: ${result.error.slice(0, 60)}` };
|
|
355
|
-
}
|
|
356
|
-
assert(elapsed < TIMEOUT_MS, `Took ${elapsed}ms with days_back=365 — potential full-scan`);
|
|
357
|
-
return { passed: true, detail: `${elapsed}ms with days_back=365, ${result.count} results` };
|
|
358
|
-
},
|
|
359
|
-
},
|
|
360
|
-
];
|
|
361
|
-
|
|
362
|
-
// ============================================================
|
|
363
|
-
// Runner
|
|
364
|
-
// ============================================================
|
|
365
|
-
|
|
366
|
-
/**
 * Runs the mail integration tests sequentially and prints a summary.
 *
 * A test counts as skipped when it reports `passed: true` with a detail that
 * starts with "Skipped"; as failed when it reports a falsy `passed`, returns
 * nothing, or throws (including non-Error values).
 *
 * @param {string} [filterTag] - When given, only tests whose tags contain it
 *   exactly, whose id contains it, or whose name contains it
 *   (case-insensitive) are run.
 * @returns {Promise<{passed: number, failed: number, skipped: number, total: number, failures: Array<{id: string, name: string, detail: string}>}>}
 */
async function run(filterTag) {
  const tests = filterTag
    ? TESTS.filter(t => t.tags.includes(filterTag) || t.id.includes(filterTag) || t.name.toLowerCase().includes(filterTag.toLowerCase()))
    : TESTS;

  console.log(`\n=== Mail Integration Tests ===`);
  console.log(`Running ${tests.length}/${TESTS.length} tests\n`);

  let passed = 0;
  let failed = 0;
  let skipped = 0;
  const failures = [];

  for (const test of tests) {
    process.stdout.write(` [${test.id}] ${test.name.slice(0, 60).padEnd(60)} `);
    try {
      const result = await test.run();
      if (result?.passed) {
        // Accept any "Skipped…" prefix (e.g. "Skipped (expected): …") so
        // environment-related skips are never counted as real passes.
        const isSkip = /^Skipped\b/.test(String(result.detail));
        if (isSkip) {
          skipped++;
          console.log(`[SKIP] ${result.detail}`);
        } else {
          passed++;
          console.log(`[PASS] ${result.detail}`);
        }
      } else {
        failed++;
        const detail = result?.detail ?? 'Test returned no result';
        console.log(`[FAIL] ${detail}`);
        failures.push({ id: test.id, name: test.name, detail });
      }
    } catch (err) {
      failed++;
      // Tests may throw non-Error values; never let reporting itself crash.
      const detail = String(err?.message ?? err).slice(0, 120);
      console.log(`[FAIL] ${detail}`);
      failures.push({ id: test.id, name: test.name, detail });
    }
  }

  console.log(`\n${'='.repeat(60)}`);
  console.log(` ${passed} passed, ${failed} failed, ${skipped} skipped (${tests.length} total)`);
  if (failures.length) {
    console.log(`\n Failures:`);
    for (const f of failures) {
      console.log(` ${f.id}: ${f.detail}`);
    }
  }
  console.log(`${'='.repeat(60)}\n`);

  return { passed, failed, skipped, total: tests.length, failures };
}
|
|
417
|
-
|
|
418
|
-
// ============================================================
|
|
419
|
-
// CLI
|
|
420
|
-
// ============================================================
|
|
421
|
-
|
|
422
|
-
// CLI entry point: only runs when this file is executed directly.
if (require.main === module) {
  // Resolves the test filter from argv. Supported forms, in priority order:
  //   --filter=value   (everything after the first "=" is the value)
  //   --filter value   (the argument following the flag)
  //   value            (bare first positional argument)
  // Returns null when no filter was supplied.
  const parseFilterArg = (argv) => {
    const inlinePrefix = '--filter=';
    const inline = argv.find((a) => a.startsWith(inlinePrefix));
    if (inline && inline.length > inlinePrefix.length) {
      return inline.slice(inlinePrefix.length);
    }
    const flagIndex = argv.indexOf('--filter');
    if (flagIndex !== -1) {
      const next = argv[flagIndex + 1];
      if (next && !next.startsWith('-')) return next;
    }
    return argv[0] && !argv[0].startsWith('-') ? argv[0] : null;
  };

  const args = process.argv.slice(2);
  const filter = parseFilterArg(args);
  const dryRun = args.includes('--dry-run');

  if (dryRun) {
    console.log(`\nMail Integration Tests (${TESTS.length} total):\n`);
    for (const t of TESTS) {
      console.log(` ${t.id} [${t.tags.join(', ')}]`);
      console.log(` ${t.name}\n`);
    }
    process.exit(0);
  }

  // Exit code: 0 = all passed/skipped, 1 = at least one failure, 2 = runner crashed.
  run(filter)
    .then(r => process.exit(r.failed > 0 ? 1 : 0))
    .catch(err => { console.error('Fatal:', err); process.exit(2); });
}
|
|
442
|
-
|
|
443
|
-
module.exports = { run, TESTS };
|
|
@@ -1,186 +0,0 @@
|
|
|
1
|
-
'use strict';
|
|
2
|
-
|
|
3
|
-
const crypto = require('crypto');
|
|
4
|
-
const path = require('path');
|
|
5
|
-
const { execFileSync } = require('child_process');
|
|
6
|
-
|
|
7
|
-
const DEFAULT_DATASET_VERSION = 'local-v1';
|
|
8
|
-
const DEFAULT_SCORER_VERSION = 'wall-e-eval-v2';
|
|
9
|
-
const DEFAULT_EVALUATOR_VERSION = 'wall-e-evaluator-v2';
|
|
10
|
-
|
|
11
|
-
let cachedRepoSha;
|
|
12
|
-
|
|
13
|
-
/**
 * Serializes a value to a JSON-like string with object keys emitted in
 * sorted order, so structurally equal objects produce the same text.
 *
 * Primitives and null are delegated to JSON.stringify; arrays and objects
 * are walked recursively.
 *
 * @param {*} value - Value to serialize.
 * @returns {string|undefined} Canonical text (undefined for inputs that
 *   JSON.stringify itself maps to undefined).
 */
function stableStringify(value) {
  const isComposite = value !== null && typeof value === 'object';
  if (!isComposite) return JSON.stringify(value);

  if (Array.isArray(value)) {
    const items = value.map(stableStringify);
    return `[${items.join(',')}]`;
  }

  const members = [];
  for (const key of Object.keys(value).sort()) {
    members.push(`${JSON.stringify(key)}:${stableStringify(value[key])}`);
  }
  return `{${members.join(',')}}`;
}
|
|
20
|
-
|
|
21
|
-
/**
 * Computes the hex-encoded SHA-256 digest of a value's string form.
 *
 * @param {*} value - Input; null/undefined are hashed as the empty string,
 *   anything else is converted with String().
 * @returns {string} Lowercase hexadecimal digest.
 */
function sha256(value) {
  const text = String(value ?? '');
  const hasher = crypto.createHash('sha256');
  hasher.update(text);
  return hasher.digest('hex');
}
|
|
24
|
-
|
|
25
|
-
/**
 * Hashes a value by first rendering it with stableStringify and then
 * passing that text to sha256.
 *
 * @param {*} value - Value to hash.
 * @returns {string} Digest returned by sha256 for the canonical text.
 */
function hashObject(value) {
  const canonical = stableStringify(value);
  return sha256(canonical);
}
|
|
28
|
-
|
|
29
|
-
// Memo of `git rev-parse HEAD` results keyed by working directory. A null
// entry records that the lookup failed for that directory.
const repoShaByCwd = new Map();

/**
 * Returns the current git HEAD commit SHA for a directory, or null when it
 * cannot be determined.
 *
 * Results are memoized per `cwd`, so asking about a different directory is
 * never answered with an earlier directory's SHA.
 *
 * @param {string} [cwd] - Directory to run git in; defaults to the parent of
 *   this file's directory.
 * @returns {string|null} Trimmed output of `git rev-parse HEAD`, or null if
 *   the command fails or prints nothing.
 */
function getRepoSha(cwd = path.resolve(__dirname, '..')) {
  if (repoShaByCwd.has(cwd)) return repoShaByCwd.get(cwd);

  let sha = null;
  try {
    sha = execFileSync('git', ['rev-parse', 'HEAD'], {
      cwd,
      encoding: 'utf8',
      // stderr is discarded so a failed lookup does not print to the console.
      stdio: ['ignore', 'pipe', 'ignore'],
    }).trim() || null;
  } catch {
    // Best effort: any failure to run the command is reported as null.
    sha = null;
  }

  repoShaByCwd.set(cwd, sha);
  return sha;
}
|
|
42
|
-
|
|
43
|
-
/**
 * Converts a value to a JSON string without ever throwing.
 *
 * @param {*} value - Value to encode.
 * @returns {string|null|undefined} null for null/undefined input or when
 *   serialization throws; the input itself when it is already a string;
 *   otherwise whatever JSON.stringify returns.
 */
function safeJson(value) {
  if (value === null || value === undefined) return null;
  if (typeof value === 'string') return value;

  let encoded;
  try {
    encoded = JSON.stringify(value);
  } catch {
    encoded = null;
  }
  return encoded;
}
|
|
48
|
-
|
|
49
|
-
/**
 * Copies only the allow-listed run settings out of a config object.
 *
 * @param {object} [config={}] - Arbitrary run configuration.
 * @returns {object} New object holding the allow-listed keys whose value is
 *   not undefined, in allow-list order.
 */
function pickRunConfig(config = {}) {
  const selected = {};
  [
    'temperature', 'seed', 'maxTokens', 'timeoutMs', 'concurrency',
    'budgetDollars', 'suite', 'taskType', 'scoringMethod',
  ].forEach((key) => {
    const setting = config[key];
    if (setting !== undefined) selected[key] = setting;
  });
  return selected;
}
|
|
60
|
-
|
|
61
|
-
/**
 * Builds the fixed-shape subset of a benchmark that feeds the dataset hash.
 *
 * @param {object} [benchmark={}] - Benchmark definition.
 * @returns {object} Object with `id` (first truthy of id / promptId /
 *   benchmark_id), `prompt` (empty string when falsy) and the remaining
 *   descriptive fields, each replaced by null when falsy.
 */
function samplePayloadForHash(benchmark = {}) {
  const orNull = (candidate) => candidate || null;

  const payload = {};
  payload.id = orNull(benchmark.id || benchmark.promptId || benchmark.benchmark_id);
  payload.prompt = benchmark.prompt || '';
  for (const field of ['taskType', 'difficulty', 'expectedTraits', 'expectedInReply', 'agentExpectations']) {
    payload[field] = orNull(benchmark[field]);
  }
  return payload;
}
|
|
72
|
-
|
|
73
|
-
/**
 * Assembles the provenance record for one evaluation sample.
 *
 * Derives the sample id, dataset version/hash and prompt hash from the
 * benchmark, filters the run configuration through pickRunConfig, looks up
 * the repository SHA, and returns both the flat fields and their JSON
 * encodings.
 *
 * @param {object} [options]
 * @param {string} [options.suite] - Suite name; 'unknown' is substituted when
 *   deriving the dataset version and hash.
 * @param {object} [options.benchmark={}] - Benchmark definition; its own
 *   sampleId / datasetVersion / datasetHash / modelSnapshot win when truthy.
 * @param {string} [options.runId]
 * @param {string} [options.provider]
 * @param {string} [options.model]
 * @param {object} [options.runConfig={}] - Raw run settings.
 * @param {string} [options.scorerVersion=DEFAULT_SCORER_VERSION]
 * @param {string} [options.evaluatorVersion=DEFAULT_EVALUATOR_VERSION]
 * @param {string} [options.scoringMethod]
 * @param {string} [options.artifactPath]
 * @param {*} [options.trusted] - Coerced to boolean in the manifest; stays
 *   null when undefined.
 * @returns {object} Flat manifest fields plus `runConfigJson` and
 *   `evalManifestJson`.
 */
function buildEvalManifest({
  suite,
  benchmark = {},
  runId,
  provider,
  model,
  runConfig = {},
  scorerVersion = DEFAULT_SCORER_VERSION,
  evaluatorVersion = DEFAULT_EVALUATOR_VERSION,
  scoringMethod,
  artifactPath,
  trusted,
} = {}) {
  const suiteLabel = suite || 'unknown';
  const sampleId = benchmark.sampleId || benchmark.id || benchmark.promptId || benchmark.benchmark_id || null;
  const datasetVersion = benchmark.datasetVersion || `${suiteLabel}:${DEFAULT_DATASET_VERSION}`;

  // Only hash the sample when the benchmark does not carry its own hash.
  let datasetHash = benchmark.datasetHash;
  if (!datasetHash) {
    datasetHash = hashObject({
      suite: suiteLabel,
      datasetVersion,
      sample: samplePayloadForHash(benchmark),
    });
  }

  const promptHash = sha256(benchmark.prompt || '');
  const sanitizedConfig = pickRunConfig({ ...runConfig, suite, scoringMethod });
  const repoSha = getRepoSha();

  // Values shared between the embedded manifest and the flat result.
  const modelSnapshot = benchmark.modelSnapshot || model || null;
  const normalizedScoringMethod = scoringMethod || null;
  const normalizedArtifactPath = artifactPath || null;

  const manifest = {
    runId: runId || null,
    suite: suite || null,
    datasetVersion,
    datasetHash,
    sampleId,
    promptHash,
    provider: provider || null,
    model: model || null,
    modelSnapshot,
    scorerVersion,
    evaluatorVersion,
    scoringMethod: normalizedScoringMethod,
    repoSha,
    runConfig: sanitizedConfig,
    artifactPath: normalizedArtifactPath,
    trusted: trusted === undefined ? null : !!trusted,
  };

  return {
    sampleId,
    datasetVersion,
    datasetHash,
    promptHash,
    repoSha,
    scorerVersion,
    evaluatorVersion,
    scoringMethod: normalizedScoringMethod,
    runConfigJson: safeJson(sanitizedConfig),
    evalManifestJson: safeJson(manifest),
    artifactPath: normalizedArtifactPath,
    modelSnapshot,
    temperature: sanitizedConfig.temperature ?? null,
    seed: sanitizedConfig.seed ?? null,
  };
}
|
|
133
|
-
|
|
134
|
-
/**
 * Returns a copy of a benchmark result entry with manifest fields filled in.
 *
 * A manifest is built from the entry plus the surrounding run context;
 * values already present (truthy) on the entry win over the manifest's, and
 * `temperature` / `seed` fall back only when the entry's value is
 * null/undefined.
 *
 * @param {object} [entry={}] - Raw result row.
 * @param {object} [context={}] - Run-level defaults (suite, benchmark, runId,
 *   provider, model, runConfig, versions, scoringMethod, artifactPath,
 *   trusted).
 * @returns {object} New object: the entry's own fields plus manifest fields.
 */
function decorateBenchmarkResult(entry = {}, context = {}) {
  const contextBenchmark = context.benchmark;
  const benchmark = {
    ...(contextBenchmark || {}),
    id: entry.promptId || entry.benchmark_id || contextBenchmark?.id,
    prompt: entry.prompt || contextBenchmark?.prompt,
    taskType: entry.taskType || contextBenchmark?.taskType,
    difficulty: entry.difficulty || contextBenchmark?.difficulty,
  };

  const manifest = buildEvalManifest({
    suite: entry.suite || context.suite,
    benchmark,
    runId: entry.runId || context.runId,
    provider: entry.provider || context.provider,
    model: entry.model || context.model,
    runConfig: context.runConfig || {},
    scorerVersion: entry.scorerVersion || context.scorerVersion,
    evaluatorVersion: entry.evaluatorVersion || context.evaluatorVersion,
    scoringMethod: entry.scoringMethod || context.scoringMethod,
    artifactPath: entry.artifactPath || context.artifactPath,
    trusted: entry.trusted ?? context.trusted,
  });

  // Fields where any truthy entry value overrides the manifest value.
  const truthyFallbackFields = [
    'sampleId', 'datasetVersion', 'datasetHash', 'promptHash', 'repoSha',
    'scorerVersion', 'evaluatorVersion', 'scoringMethod', 'runConfigJson',
    'evalManifestJson', 'artifactPath', 'modelSnapshot',
  ];

  const decorated = { ...entry };
  for (const field of truthyFallbackFields) {
    decorated[field] = entry[field] || manifest[field];
  }
  // 0 is a legitimate temperature/seed, so only null/undefined fall back.
  decorated.temperature = entry.temperature ?? manifest.temperature;
  decorated.seed = entry.seed ?? manifest.seed;
  return decorated;
}
|
|
175
|
-
|
|
176
|
-
module.exports = {
|
|
177
|
-
DEFAULT_DATASET_VERSION,
|
|
178
|
-
DEFAULT_SCORER_VERSION,
|
|
179
|
-
DEFAULT_EVALUATOR_VERSION,
|
|
180
|
-
stableStringify,
|
|
181
|
-
sha256,
|
|
182
|
-
hashObject,
|
|
183
|
-
getRepoSha,
|
|
184
|
-
buildEvalManifest,
|
|
185
|
-
decorateBenchmarkResult,
|
|
186
|
-
};
|
|
@@ -1,57 +0,0 @@
|
|
|
1
|
-
'use strict';
|
|
2
|
-
|
|
3
|
-
const path = require('node:path');
|
|
4
|
-
|
|
5
|
-
const { runAgentBenchmark } = require('../../agent-runner');
|
|
6
|
-
|
|
7
|
-
/**
 * Creates the 'coding-agent' domain adapter used to look up and run
 * coding-agent benchmarks.
 *
 * @param {object} [options]
 * @param {Function} options.runAgentLoop - Agent loop passed through to
 *   runAgentBenchmark; required by runTask.
 * @param {?string} [options.provider=null]
 * @param {?string} [options.model=null]
 * @param {?number} [options.timeoutMs=null]
 * @param {?Array<object>} [options.benchmarks=null] - Benchmark list; when
 *   falsy the list from loadCodingAgentBenchmarks() is used.
 * @returns {{id: string, listTasks: Function, runTask: Function}} Adapter.
 */
function createCodingAgentAdapter({
  runAgentLoop,
  provider = null,
  model = null,
  timeoutMs = null,
  benchmarks = null,
} = {}) {
  const suite = benchmarks || loadCodingAgentBenchmarks();
  const byId = new Map(suite.map((benchmark) => [benchmark.id, benchmark]));

  // Returns the benchmark registered under taskId or throws for unknown ids.
  const requireBenchmark = (taskId) => {
    const benchmark = byId.get(taskId);
    if (!benchmark) throw new Error(`Unknown coding-agent benchmark: ${taskId}`);
    return benchmark;
  };

  return {
    id: 'coding-agent',

    /**
     * Resolves task ids to benchmark definitions, preserving order.
     * @param {string[]} [taskIds=[]]
     * @returns {object[]}
     * @throws {Error} When any id is unknown.
     */
    listTasks(taskIds = []) {
      return taskIds.map((taskId) => requireBenchmark(taskId));
    },

    /**
     * Runs one benchmark, letting the optional harness rewrite the prompt
     * (`transformPrompt`) and the run options (`buildRunOptions`).
     * @returns {Promise<*>} Whatever runAgentBenchmark returns.
     * @throws {Error} For an unknown task id or a missing runAgentLoop.
     */
    async runTask({ taskId, harness, candidateId, artifactDir, domainSpec } = {}) {
      const benchmark = requireBenchmark(taskId);
      if (!runAgentLoop) throw new Error('runAgentLoop is required for coding-agent adapter');

      const adaptedBenchmark = {
        ...benchmark,
        originalPrompt: benchmark.prompt,
        prompt: harness?.transformPrompt
          ? harness.transformPrompt(benchmark.prompt, { benchmark, taskId, candidateId, domainSpec })
          : benchmark.prompt,
      };

      const baseOptions = { runAgentLoop, provider, model, timeoutMs, artifactDir };
      const customOptions = harness?.buildRunOptions
        ? harness.buildRunOptions({ benchmark: adaptedBenchmark, taskId, candidateId, domainSpec, options: baseOptions })
        : null;
      // A harness hook that returns nothing means "no overrides"; fall back
      // to the base options instead of dereferencing null/undefined below.
      const runOptions = customOptions ?? baseOptions;

      return runAgentBenchmark(adaptedBenchmark, {
        ...baseOptions,
        ...runOptions,
        // The caller-supplied artifactDir always wins over harness overrides.
        artifactDir,
        runAgentLoop: runOptions.runAgentLoop || runAgentLoop,
      });
    },
  };
}
|
|
49
|
-
|
|
50
|
-
/**
 * Loads the bundled coding-agent benchmark definitions.
 *
 * @returns {*} Parsed contents of benchmarks/coding-agent.json, resolved two
 *   directories above this file.
 */
function loadCodingAgentBenchmarks() {
  const benchmarkFile = path.resolve(__dirname, '../../benchmarks/coding-agent.json');
  return require(benchmarkFile);
}
|
|
53
|
-
|
|
54
|
-
module.exports = {
|
|
55
|
-
createCodingAgentAdapter,
|
|
56
|
-
loadCodingAgentBenchmarks,
|
|
57
|
-
};
|