create-walle 0.9.21 → 0.9.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -5
- package/package.json +2 -2
- package/template/CLAUDE.md +2 -2
- package/template/LICENSE +1 -1
- package/template/bin/ctm-dev-cleanup.js +24 -3
- package/template/bin/ctm-launch.sh +13 -0
- package/template/bin/dev.sh +156 -18
- package/template/bin/node-bin.sh +84 -0
- package/template/bin/pin-node.sh +51 -0
- package/template/claude-task-manager/api-prompts.js +1203 -182
- package/template/claude-task-manager/api-reviews.js +109 -15
- package/template/claude-task-manager/approval-agent.js +1360 -280
- package/template/claude-task-manager/bin/restart-ctm.sh +64 -23
- package/template/claude-task-manager/bin/storage-migration-supervisor.js +338 -0
- package/template/claude-task-manager/db.js +4417 -295
- package/template/claude-task-manager/docs/app-update-refresh-protocol.md +69 -0
- package/template/claude-task-manager/docs/approval-ai-refinement.md +138 -0
- package/template/claude-task-manager/docs/approval-rescue-loop.md +74 -0
- package/template/claude-task-manager/docs/codex-operational-warning-health.md +107 -0
- package/template/claude-task-manager/docs/codex-resume-state-guard-design.md +17 -12
- package/template/claude-task-manager/docs/codex-terminal-render-controller-handoff.md +311 -0
- package/template/claude-task-manager/docs/coding-agent-hooks-architecture.md +418 -0
- package/template/claude-task-manager/docs/conversation-import-freshness.md +20 -0
- package/template/claude-task-manager/docs/google-workspace-auth-health.md +77 -0
- package/template/claude-task-manager/docs/image-paste-ux.md +13 -0
- package/template/claude-task-manager/docs/ipad-web-preview.md +88 -0
- package/template/claude-task-manager/docs/main-loop-offload-architecture.md +66 -0
- package/template/claude-task-manager/docs/microsoft-dev-tunnel-phone-access-design.md +274 -519
- package/template/claude-task-manager/docs/mobile-live-streaming.md +27 -5
- package/template/claude-task-manager/docs/mobile-remote-submission-lifecycle.md +69 -0
- package/template/claude-task-manager/docs/phone-access-design.md +53 -15
- package/template/claude-task-manager/docs/phone-passkey-identity.md +122 -0
- package/template/claude-task-manager/docs/phone-setup.md +3 -0
- package/template/claude-task-manager/docs/prompt-editing-tree-design.md +25 -1
- package/template/claude-task-manager/docs/remote-desktop-access-design.md +268 -0
- package/template/claude-task-manager/docs/restart-lifecycle-architecture.md +95 -0
- package/template/claude-task-manager/docs/runtime-work-control-plane.md +53 -0
- package/template/claude-task-manager/docs/session-interactive-wait-surfaces.md +38 -0
- package/template/claude-task-manager/docs/session-needs-you-dismissal.md +84 -0
- package/template/claude-task-manager/docs/session-render-state-management-design.md +91 -3
- package/template/claude-task-manager/docs/session-standup-command-center-design.md +25 -1
- package/template/claude-task-manager/docs/session-title-authority.md +32 -0
- package/template/claude-task-manager/docs/session-workspace-binding.md +33 -0
- package/template/claude-task-manager/docs/skill-intent-resolution-design.md +72 -0
- package/template/claude-task-manager/docs/walle-mcp-supervisor-health.md +86 -0
- package/template/claude-task-manager/docs/walle-relay-phone-access-design.md +24 -15
- package/template/claude-task-manager/docs/walle-session-history-hydration.md +114 -0
- package/template/claude-task-manager/docs/walle-session-input-queue.md +104 -0
- package/template/claude-task-manager/docs/walle-session-model-catalog.md +90 -0
- package/template/claude-task-manager/docs/walle-session-model-preferences.md +15 -6
- package/template/claude-task-manager/git-utils.js +897 -27
- package/template/claude-task-manager/lib/agent-capabilities.js +33 -0
- package/template/claude-task-manager/lib/agent-cli-cache.js +37 -7
- package/template/claude-task-manager/lib/agent-hooks-installer.js +26 -2
- package/template/claude-task-manager/lib/agent-presets.js +17 -1
- package/template/claude-task-manager/lib/all-sessions-query.js +108 -0
- package/template/claude-task-manager/lib/approval-ai-refinement.js +488 -0
- package/template/claude-task-manager/lib/approval-self-adapt.js +168 -0
- package/template/claude-task-manager/lib/async-semaphore.js +44 -0
- package/template/claude-task-manager/lib/auth-context.js +5 -0
- package/template/claude-task-manager/lib/auth-rate-limit.js +47 -4
- package/template/claude-task-manager/lib/auth-rules.js +29 -2
- package/template/claude-task-manager/lib/auto-approval-verifier.js +129 -16
- package/template/claude-task-manager/lib/background-llm.js +144 -17
- package/template/claude-task-manager/lib/branch-inventory.js +212 -0
- package/template/claude-task-manager/lib/claude-desktop-sessions.js +15 -3
- package/template/claude-task-manager/lib/coalesce-sync-frames.js +151 -0
- package/template/claude-task-manager/lib/codex-launch-health.js +762 -0
- package/template/claude-task-manager/lib/codex-transcript-pager.js +51 -0
- package/template/claude-task-manager/lib/codex-zst.js +124 -0
- package/template/claude-task-manager/lib/coding-agent-models.js +233 -30
- package/template/claude-task-manager/lib/connection-health.js +232 -0
- package/template/claude-task-manager/lib/conversation-blob-parser.js +42 -0
- package/template/claude-task-manager/lib/conversation-tail-merge.js +89 -26
- package/template/claude-task-manager/lib/ctm-session-context-api.js +39 -10
- package/template/claude-task-manager/lib/cursor-conversation-store.js +354 -0
- package/template/claude-task-manager/lib/db-owner-worker-client.js +315 -0
- package/template/claude-task-manager/lib/document-review.js +141 -6
- package/template/claude-task-manager/lib/escalation-review.js +152 -0
- package/template/claude-task-manager/lib/graceful-shutdown.js +159 -0
- package/template/claude-task-manager/lib/headless-term-service.js +678 -0
- package/template/claude-task-manager/lib/heavy-worker-fallback.js +38 -0
- package/template/claude-task-manager/lib/jsonl-conversation-parser.js +542 -0
- package/template/claude-task-manager/lib/jsonl-range-reader.js +112 -0
- package/template/claude-task-manager/lib/main-db-census.js +216 -0
- package/template/claude-task-manager/lib/message-pagination.js +106 -4
- package/template/claude-task-manager/lib/microsoft-dev-tunnel-setup.js +750 -26
- package/template/claude-task-manager/lib/mobile-auth-api.js +274 -7
- package/template/claude-task-manager/lib/mobile-auth-store.js +592 -10
- package/template/claude-task-manager/lib/mobile-notification-dispatcher.js +15 -0
- package/template/claude-task-manager/lib/model-overview-brain-fallback.js +311 -0
- package/template/claude-task-manager/lib/model-overview-cache.js +141 -0
- package/template/claude-task-manager/lib/models-health-routing-notice.js +126 -0
- package/template/claude-task-manager/lib/node-pin-guard.js +93 -0
- package/template/claude-task-manager/lib/perf-tracker.js +242 -6
- package/template/claude-task-manager/lib/permission-match.js +76 -0
- package/template/claude-task-manager/lib/permission-sync.js +133 -20
- package/template/claude-task-manager/lib/process-title.js +35 -0
- package/template/claude-task-manager/lib/prompt-executions-query.js +25 -0
- package/template/claude-task-manager/lib/prompt-index-disk-cache.js +44 -0
- package/template/claude-task-manager/lib/prompt-intent.js +132 -0
- package/template/claude-task-manager/lib/provider-user-context.js +34 -0
- package/template/claude-task-manager/lib/read-pool-client.js +313 -0
- package/template/claude-task-manager/lib/readpool-breaker.js +31 -0
- package/template/claude-task-manager/lib/recent-sessions-breaker.js +12 -0
- package/template/claude-task-manager/lib/remote-feedback-client.js +72 -0
- package/template/claude-task-manager/lib/remote-relay-protocol.js +37 -4
- package/template/claude-task-manager/lib/remote-relay-store.js +159 -0
- package/template/claude-task-manager/lib/remote-submission-observer.js +278 -0
- package/template/claude-task-manager/lib/restart-guard.js +109 -0
- package/template/claude-task-manager/lib/restore-interruption-detector.js +439 -0
- package/template/claude-task-manager/lib/restore-policy.js +13 -0
- package/template/claude-task-manager/lib/restore-resume-batch.js +74 -0
- package/template/claude-task-manager/lib/restore-runtime.js +68 -0
- package/template/claude-task-manager/lib/restore-storm.js +34 -0
- package/template/claude-task-manager/lib/resume-cwd.js +36 -0
- package/template/claude-task-manager/lib/resume-preflight.js +313 -0
- package/template/claude-task-manager/lib/runtime-work-registry.js +444 -0
- package/template/claude-task-manager/lib/sanitize-openai-auth.js +31 -0
- package/template/claude-task-manager/lib/scheduler.js +21 -1
- package/template/claude-task-manager/lib/scrollback-snapshot-store.js +159 -0
- package/template/claude-task-manager/lib/serial-task-queue.js +64 -0
- package/template/claude-task-manager/lib/server-listeners.js +239 -0
- package/template/claude-task-manager/lib/session-capture.js +42 -7
- package/template/claude-task-manager/lib/session-content-backfill.js +131 -0
- package/template/claude-task-manager/lib/session-history.js +388 -43
- package/template/claude-task-manager/lib/session-host-manager.js +287 -0
- package/template/claude-task-manager/lib/session-image-refs.js +209 -0
- package/template/claude-task-manager/lib/session-jobs.js +399 -59
- package/template/claude-task-manager/lib/session-prompt-index.js +137 -0
- package/template/claude-task-manager/lib/session-restore.js +53 -0
- package/template/claude-task-manager/lib/session-standup.js +123 -23
- package/template/claude-task-manager/lib/session-state-bus.js +14 -0
- package/template/claude-task-manager/lib/session-stream.js +64 -16
- package/template/claude-task-manager/lib/session-timeline-summary.js +260 -0
- package/template/claude-task-manager/lib/session-token-usage.js +494 -0
- package/template/claude-task-manager/lib/session-workspace-binding.js +356 -0
- package/template/claude-task-manager/lib/setup-network-config.js +9 -0
- package/template/claude-task-manager/lib/size-cap.js +45 -0
- package/template/claude-task-manager/lib/size-cap.test.js +62 -0
- package/template/claude-task-manager/lib/skill-autocomplete.js +180 -1
- package/template/claude-task-manager/lib/skill-intent-resolver.js +304 -0
- package/template/claude-task-manager/lib/sqlite-driver.js +19 -3
- package/template/claude-task-manager/lib/standup-attention.js +7 -3
- package/template/claude-task-manager/lib/status-authority.js +39 -0
- package/template/claude-task-manager/lib/status-hooks.js +4 -0
- package/template/claude-task-manager/lib/storage-migration.js +235 -0
- package/template/claude-task-manager/lib/structured-capture.js +298 -0
- package/template/claude-task-manager/lib/sync-io-census.js +163 -0
- package/template/claude-task-manager/lib/tailscale-setup.js +6 -0
- package/template/claude-task-manager/lib/terminal-activity-evidence.js +33 -0
- package/template/claude-task-manager/lib/terminal-choice.js +364 -0
- package/template/claude-task-manager/lib/terminal-control-sanitize.js +17 -0
- package/template/claude-task-manager/lib/terminal-fingerprint.js +48 -0
- package/template/claude-task-manager/lib/terminal-output-flush.js +84 -0
- package/template/claude-task-manager/lib/timeline-order.js +122 -0
- package/template/claude-task-manager/lib/transcript-store.js +348 -43
- package/template/claude-task-manager/lib/transport-security.js +84 -1
- package/template/claude-task-manager/lib/wait-state.js +184 -0
- package/template/claude-task-manager/lib/walle-client.js +47 -5
- package/template/claude-task-manager/lib/walle-ctm-history.js +564 -4
- package/template/claude-task-manager/lib/walle-external-actions.js +135 -16
- package/template/claude-task-manager/lib/walle-history-hydration.js +46 -0
- package/template/claude-task-manager/lib/walle-native-health.js +403 -0
- package/template/claude-task-manager/lib/walle-repair.js +701 -0
- package/template/claude-task-manager/lib/walle-session-cache.js +109 -0
- package/template/claude-task-manager/lib/walle-session-context.js +57 -21
- package/template/claude-task-manager/lib/walle-session-model-catalog.js +34 -0
- package/template/claude-task-manager/lib/walle-supervisor.js +539 -63
- package/template/claude-task-manager/lib/walle-transcript.js +52 -0
- package/template/claude-task-manager/lib/worktree-active-sync.js +11 -7
- package/template/claude-task-manager/lib/worktree-cwd.js +32 -1
- package/template/claude-task-manager/package.json +1 -1
- package/template/claude-task-manager/prompt-harvest.js +89 -66
- package/template/claude-task-manager/providers/claude-code.js +51 -3
- package/template/claude-task-manager/providers/cursor.js +140 -45
- package/template/claude-task-manager/public/css/reviews.css +551 -61
- package/template/claude-task-manager/public/css/setup.css +191 -0
- package/template/claude-task-manager/public/css/walle-session.css +865 -10
- package/template/claude-task-manager/public/css/walle.css +154 -0
- package/template/claude-task-manager/public/designs/ai-providers-consolidation-v2.html +830 -0
- package/template/claude-task-manager/public/index.html +18516 -2058
- package/template/claude-task-manager/public/ipad.html +363 -0
- package/template/claude-task-manager/public/js/document-review-links.js +301 -0
- package/template/claude-task-manager/public/js/image-normalize.js +69 -36
- package/template/claude-task-manager/public/js/message-renderer.js +1265 -77
- package/template/claude-task-manager/public/js/prompts.js +66 -29
- package/template/claude-task-manager/public/js/reviews.js +901 -133
- package/template/claude-task-manager/public/js/session-activity-utils.js +11 -1
- package/template/claude-task-manager/public/js/session-search-utils.js +94 -10
- package/template/claude-task-manager/public/js/session-status-precedence.js +23 -5
- package/template/claude-task-manager/public/js/setup.js +1273 -176
- package/template/claude-task-manager/public/js/stream-view.js +691 -73
- package/template/claude-task-manager/public/js/terminal-reconciler.js +210 -0
- package/template/claude-task-manager/public/js/walle-session.js +2455 -158
- package/template/claude-task-manager/public/js/walle.js +455 -28
- package/template/claude-task-manager/public/m/app.css +2909 -262
- package/template/claude-task-manager/public/m/app.js +6601 -398
- package/template/claude-task-manager/public/m/claim.html +224 -17
- package/template/claude-task-manager/public/m/index.html +117 -21
- package/template/claude-task-manager/public/m/sw.js +3 -1
- package/template/claude-task-manager/public/manifest.json +2 -2
- package/template/claude-task-manager/public/prompts.html +30 -14
- package/template/claude-task-manager/queue-engine.js +507 -28
- package/template/claude-task-manager/scripts/repair-claude-session-images.js +27 -8
- package/template/claude-task-manager/server.js +14341 -2197
- package/template/claude-task-manager/session-integrity.js +160 -18
- package/template/claude-task-manager/session-search-ranking.js +1 -0
- package/template/claude-task-manager/session-utils.js +25 -5
- package/template/claude-task-manager/workers/approval-blocklist.js +96 -6
- package/template/claude-task-manager/workers/approval-widget-validator.js +14 -8
- package/template/claude-task-manager/workers/conversation-import-worker.js +11 -50
- package/template/claude-task-manager/workers/db-owner-worker.js +386 -0
- package/template/claude-task-manager/workers/harvest-worker.js +9 -55
- package/template/claude-task-manager/workers/headless-term-worker.js +9 -530
- package/template/claude-task-manager/workers/read-pool-worker.js +387 -0
- package/template/claude-task-manager/workers/scrollback-worker.js +11 -72
- package/template/claude-task-manager/workers/session-host-process.js +146 -0
- package/template/claude-task-manager/workers/session-integrity-worker.js +10 -54
- package/template/claude-task-manager/workers/state-detectors/base.js +18 -1
- package/template/claude-task-manager/workers/state-detectors/claude-code.js +182 -9
- package/template/claude-task-manager/workers/state-detectors/codex.js +150 -2
- package/template/claude-task-manager/workers/state-detectors/cursor.js +127 -0
- package/template/claude-task-manager/workers/state-detectors/gemini.js +21 -0
- package/template/claude-task-manager/workers/state-detectors/index.js +29 -0
- package/template/claude-task-manager/workers/state-detectors/opencode.js +103 -0
- package/template/docs/design/markdown-review-pane.md +206 -0
- package/template/docs/designs/2026-05-17-portkey-gateway-provider-ux.md +129 -38
- package/template/docs/designs/2026-05-20-mobile-worktree-finish-command.md +27 -0
- package/template/docs/designs/2026-05-22-ai-configuration-consolidation.md +248 -0
- package/template/docs/designs/ai-configuration-consolidation-mock.html +812 -0
- package/template/docs/private-memory-and-pii-policy.md +69 -0
- package/template/package.json +2 -1
- package/template/scripts/check-private-data.js +201 -0
- package/template/shared/sqlite-owner-guard.js +30 -0
- package/template/shared/sqlite-owner-write-queue.js +225 -0
- package/template/shared/sqlite-storage-policy.js +111 -0
- package/template/shared/sqlite-write-lock.js +428 -0
- package/template/wall-e/agent-runners/claude-code.js +5 -0
- package/template/wall-e/agent.js +166 -22
- package/template/wall-e/api-walle.js +524 -70
- package/template/wall-e/auth/provider-flows.js +11 -1
- package/template/wall-e/bin/walle-mcp-stdio.js +341 -17
- package/template/wall-e/brain.js +1614 -141
- package/template/wall-e/chat/attachment-blocks.js +96 -0
- package/template/wall-e/chat/attachments.js +2 -1
- package/template/wall-e/chat/capability-resolver.js +7 -7
- package/template/wall-e/chat/context-messages.js +28 -0
- package/template/wall-e/chat/conversation-frame.js +630 -0
- package/template/wall-e/chat/provider-messages.js +125 -0
- package/template/wall-e/chat.js +1002 -233
- package/template/wall-e/coding/acceptance-contract.js +170 -0
- package/template/wall-e/coding/acp-adapter.js +1 -1
- package/template/wall-e/coding/agent-catalog.js +3 -0
- package/template/wall-e/coding/artifact-store.js +93 -0
- package/template/wall-e/coding/capability-router.js +120 -0
- package/template/wall-e/coding/coding-run-controller.js +423 -0
- package/template/wall-e/coding/compaction-service.js +157 -12
- package/template/wall-e/coding/frontend-verification.js +258 -0
- package/template/wall-e/coding/lifecycle-hooks.js +75 -0
- package/template/wall-e/coding/local-preview-contract.js +157 -0
- package/template/wall-e/coding/permission-service.js +57 -13
- package/template/wall-e/coding/prompt-bundle.js +19 -1
- package/template/wall-e/coding/prompt-section-registry.js +227 -0
- package/template/wall-e/coding/provider-compat.js +15 -0
- package/template/wall-e/coding/runtime-events.js +224 -0
- package/template/wall-e/coding/runtime-mode.js +3 -0
- package/template/wall-e/coding/side-git-snapshot.js +160 -4
- package/template/wall-e/coding/snapshot-service.js +143 -1
- package/template/wall-e/coding/stream-processor.js +388 -34
- package/template/wall-e/coding/task-tool.js +141 -4
- package/template/wall-e/coding/tool-execution-controller.js +365 -0
- package/template/wall-e/coding/tool-registry.js +43 -5
- package/template/wall-e/coding/user-hooks.js +217 -0
- package/template/wall-e/coding-orchestrator.js +1330 -221
- package/template/wall-e/coding-prompts.js +20 -4
- package/template/wall-e/context/context-builder.js +15 -2
- package/template/wall-e/decision/confidence.js +1 -1
- package/template/wall-e/docs/coding-acceptance-contract.md +41 -0
- package/template/wall-e/docs/external-action-controller.md +26 -6
- package/template/wall-e/docs/telemetry-lifecycle.md +8 -2
- package/template/wall-e/embeddings.js +591 -53
- package/template/wall-e/external-action-controller.js +12 -0
- package/template/wall-e/http/auth.js +1 -0
- package/template/wall-e/http/chat-api.js +46 -11
- package/template/wall-e/http/model-admin.js +836 -34
- package/template/wall-e/lib/boot-profile.js +88 -0
- package/template/wall-e/lib/event-loop-monitor.js +93 -0
- package/template/wall-e/lib/service-health.js +194 -0
- package/template/wall-e/llm/anthropic.js +130 -5
- package/template/wall-e/llm/client.js +266 -63
- package/template/wall-e/llm/default-fallback.js +382 -0
- package/template/wall-e/llm/health.js +19 -0
- package/template/wall-e/llm/message-guard.js +78 -0
- package/template/wall-e/llm/model-catalog.js +252 -1
- package/template/wall-e/llm/openai.js +26 -4
- package/template/wall-e/llm/portkey-sync.js +654 -0
- package/template/wall-e/llm/provider-error.js +30 -2
- package/template/wall-e/llm/registry.js +5 -1
- package/template/wall-e/llm/request-compat.js +67 -0
- package/template/wall-e/loops/backfill.js +79 -23
- package/template/wall-e/loops/brain-optimize.js +67 -0
- package/template/wall-e/loops/ingest.js +25 -10
- package/template/wall-e/loops/question-digest.js +160 -0
- package/template/wall-e/loops/reflect.js +6 -4
- package/template/wall-e/loops/think.js +39 -12
- package/template/wall-e/mcp-server.js +318 -36
- package/template/wall-e/memory/ctm-context-client.js +52 -14
- package/template/wall-e/memory/ctm-operational-context.js +237 -0
- package/template/wall-e/memory/ctm-prompt-executions-client.js +128 -0
- package/template/wall-e/memory/ctm-session-context.js +111 -63
- package/template/wall-e/prompts/coding/deepseek.txt +3 -0
- package/template/wall-e/prompts/coding/gemini.txt +6 -0
- package/template/wall-e/prompts/coding/gpt.txt +6 -0
- package/template/wall-e/prompts/coding/local.txt +7 -0
- package/template/wall-e/runtime/decision-hooks.js +115 -0
- package/template/wall-e/runtime/devbox-gateway.js +82 -8
- package/template/wall-e/runtime/prompt-manifest.js +86 -0
- package/template/wall-e/runtime/tool-executor.js +269 -0
- package/template/wall-e/runtime/tool-result-envelope.js +138 -0
- package/template/wall-e/runtime/transcript-projection.js +60 -0
- package/template/wall-e/runtime/walle-runtime.js +224 -0
- package/template/wall-e/scripts/db-optimize/migrate.js +162 -0
- package/template/wall-e/scripts/db-optimize/recall-eval.js +117 -0
- package/template/wall-e/server.js +15 -0
- package/template/wall-e/session-files.js +9 -0
- package/template/wall-e/skills/_bundled/google-calendar/run.js +1 -1
- package/template/wall-e/skills/_bundled/gws-workspace/run.js +1 -1
- package/template/wall-e/skills/_bundled/slack-mentions/run.js +76 -6
- package/template/wall-e/skills/claude-code-reader.js +7 -3
- package/template/wall-e/skills/script-skill-runner.js +10 -0
- package/template/wall-e/skills/skill-planner.js +38 -0
- package/template/wall-e/tools/builtin-middleware.js +19 -9
- package/template/wall-e/tools/local-tools.js +1428 -16
- package/template/wall-e/tools/permission-checker.js +73 -5
- package/template/wall-e/tools/question-manager.js +117 -7
- package/template/wall-e/training/harvester.js +12 -28
- package/template/wall-e/training/replay.js +25 -80
- package/template/website/index.html +10 -10
- package/template/wall-e/eval/ab-test.js +0 -203
- package/template/wall-e/eval/agent-runner.js +0 -772
- package/template/wall-e/eval/agent-scorer.js +0 -461
- package/template/wall-e/eval/aggregator.js +0 -414
- package/template/wall-e/eval/allowed-test-commands.js +0 -34
- package/template/wall-e/eval/benchmark-generator.js +0 -113
- package/template/wall-e/eval/benchmarks/chat-eval.json +0 -1662
- package/template/wall-e/eval/benchmarks/chat.json +0 -82
- package/template/wall-e/eval/benchmarks/coding-agent-real.json +0 -1
- package/template/wall-e/eval/benchmarks/coding-agent.json +0 -1581
- package/template/wall-e/eval/benchmarks/coding.json +0 -122
- package/template/wall-e/eval/benchmarks/memory-retrieval.json +0 -234
- package/template/wall-e/eval/benchmarks/reasoning.json +0 -82
- package/template/wall-e/eval/benchmarks/swebench-lite-30.json +0 -212
- package/template/wall-e/eval/benchmarks.js +0 -669
- package/template/wall-e/eval/cc-replay.js +0 -719
- package/template/wall-e/eval/chat-eval.js +0 -525
- package/template/wall-e/eval/check-keys.js +0 -15
- package/template/wall-e/eval/check-providers.js +0 -42
- package/template/wall-e/eval/codex-cli-baseline.js +0 -669
- package/template/wall-e/eval/coding-agent-real.js +0 -570
- package/template/wall-e/eval/context-compactor.js +0 -251
- package/template/wall-e/eval/debug-agent003.js +0 -68
- package/template/wall-e/eval/diagnostics.js +0 -216
- package/template/wall-e/eval/eval-orchestrator.js +0 -642
- package/template/wall-e/eval/evaluate.js +0 -202
- package/template/wall-e/eval/evaluator.js +0 -373
- package/template/wall-e/eval/exporter.js +0 -212
- package/template/wall-e/eval/fixtures/express-basic/package.json +0 -9
- package/template/wall-e/eval/fixtures/express-basic/server.js +0 -115
- package/template/wall-e/eval/fixtures/express-basic/test.js +0 -83
- package/template/wall-e/eval/fixtures/express-buggy/package.json +0 -9
- package/template/wall-e/eval/fixtures/express-buggy/server.js +0 -113
- package/template/wall-e/eval/fixtures/express-buggy/test.js +0 -83
- package/template/wall-e/eval/fixtures/express-buggy-items/package.json +0 -9
- package/template/wall-e/eval/fixtures/express-buggy-items/server.js +0 -112
- package/template/wall-e/eval/fixtures/express-buggy-items/test.js +0 -83
- package/template/wall-e/eval/fixtures/express-buggy-search/package.json +0 -9
- package/template/wall-e/eval/fixtures/express-buggy-search/server.js +0 -121
- package/template/wall-e/eval/fixtures/express-buggy-search/test.js +0 -83
- package/template/wall-e/eval/fixtures/express-rename-data/data.js +0 -34
- package/template/wall-e/eval/fixtures/express-rename-data/package.json +0 -9
- package/template/wall-e/eval/fixtures/express-rename-data/server.js +0 -97
- package/template/wall-e/eval/fixtures/express-rename-data/test.js +0 -88
- package/template/wall-e/eval/fixtures/express-xss/package.json +0 -12
- package/template/wall-e/eval/fixtures/express-xss/server.js +0 -90
- package/template/wall-e/eval/fixtures/express-xss/test.js +0 -67
- package/template/wall-e/eval/fixtures/express-xss/views/profile.ejs +0 -9
- package/template/wall-e/eval/fixtures/fullstack-app/config/default.js +0 -9
- package/template/wall-e/eval/fixtures/fullstack-app/config/test.js +0 -13
- package/template/wall-e/eval/fixtures/fullstack-app/package.json +0 -11
- package/template/wall-e/eval/fixtures/fullstack-app/public/css/style.css +0 -137
- package/template/wall-e/eval/fixtures/fullstack-app/public/index.html +0 -46
- package/template/wall-e/eval/fixtures/fullstack-app/public/js/app.js +0 -121
- package/template/wall-e/eval/fixtures/fullstack-app/public/js/auth.js +0 -71
- package/template/wall-e/eval/fixtures/fullstack-app/public/js/items.js +0 -80
- package/template/wall-e/eval/fixtures/fullstack-app/public/js/users.js +0 -46
- package/template/wall-e/eval/fixtures/fullstack-app/public/login.html +0 -45
- package/template/wall-e/eval/fixtures/fullstack-app/public/register.html +0 -38
- package/template/wall-e/eval/fixtures/fullstack-app/scripts/migrate.js +0 -23
- package/template/wall-e/eval/fixtures/fullstack-app/scripts/seed.js +0 -46
- package/template/wall-e/eval/fixtures/fullstack-app/server/db.js +0 -99
- package/template/wall-e/eval/fixtures/fullstack-app/server/index.js +0 -94
- package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/auth.js +0 -19
- package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/logger.js +0 -19
- package/template/wall-e/eval/fixtures/fullstack-app/server/router.js +0 -50
- package/template/wall-e/eval/fixtures/fullstack-app/server/routes/auth.js +0 -69
- package/template/wall-e/eval/fixtures/fullstack-app/server/routes/health.js +0 -23
- package/template/wall-e/eval/fixtures/fullstack-app/server/routes/items.js +0 -88
- package/template/wall-e/eval/fixtures/fullstack-app/server/routes/users.js +0 -75
- package/template/wall-e/eval/fixtures/fullstack-app/server/test.js +0 -198
- package/template/wall-e/eval/fixtures/fullstack-app/server/utils/response.js +0 -34
- package/template/wall-e/eval/fixtures/fullstack-app/server/utils/validate.js +0 -26
- package/template/wall-e/eval/fixtures/fullstack-app/server.js +0 -8
- package/template/wall-e/eval/fixtures/fullstack-app/test.js +0 -12
- package/template/wall-e/eval/fixtures/monorepo-basic/package.json +0 -8
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/data.js +0 -58
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/middleware.js +0 -46
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/package.json +0 -8
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/routes.js +0 -64
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/server.js +0 -56
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/test.js +0 -116
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/commands.js +0 -61
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/index.js +0 -62
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/output.js +0 -43
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/package.json +0 -11
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/test.js +0 -44
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/formatters.js +0 -43
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/index.js +0 -12
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/package.json +0 -5
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/test.js +0 -55
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/validators.js +0 -29
- package/template/wall-e/eval/fixtures/monorepo-basic/test.js +0 -46
- package/template/wall-e/eval/fixtures/node-cli/index.js +0 -78
- package/template/wall-e/eval/fixtures/node-cli/package.json +0 -10
- package/template/wall-e/eval/fixtures/node-cli/test.js +0 -57
- package/template/wall-e/eval/fixtures/node-typed/package.json +0 -8
- package/template/wall-e/eval/fixtures/node-typed/src/handlers.js +0 -31
- package/template/wall-e/eval/fixtures/node-typed/src/utils.js +0 -33
- package/template/wall-e/eval/fixtures/node-typed/test.js +0 -36
- package/template/wall-e/eval/fixtures/python-flask/app.py +0 -14
- package/template/wall-e/eval/fixtures/python-flask/requirements.txt +0 -2
- package/template/wall-e/eval/fixtures/python-flask/test_app.py +0 -25
- package/template/wall-e/eval/fixtures/wall-e-subset/brain.js +0 -105
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/aggregator.js +0 -101
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/chat.json +0 -20
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/coding.json +0 -32
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks.js +0 -64
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/package.json +0 -6
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/server.js +0 -31
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/test.js +0 -18
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/utils.js +0 -34
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/runner.js +0 -104
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/scorer.js +0 -73
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/test.js +0 -134
- package/template/wall-e/eval/fixtures/wall-e-subset/llm/client.js +0 -99
- package/template/wall-e/eval/fixtures/wall-e-subset/llm/providers.js +0 -63
- package/template/wall-e/eval/fixtures/wall-e-subset/llm/test.js +0 -70
- package/template/wall-e/eval/fixtures/wall-e-subset/package.json +0 -10
- package/template/wall-e/eval/fixtures/wall-e-subset/test.js +0 -86
- package/template/wall-e/eval/harvester.js +0 -685
- package/template/wall-e/eval/head-to-head.js +0 -388
- package/template/wall-e/eval/humaneval-adapter.js +0 -321
- package/template/wall-e/eval/list-models.js +0 -31
- package/template/wall-e/eval/livecodebench-adapter.js +0 -291
- package/template/wall-e/eval/mail-integration.js +0 -443
- package/template/wall-e/eval/manifest.js +0 -186
- package/template/wall-e/eval/meta-harness/adapters/coding-agent.js +0 -57
- package/template/wall-e/eval/meta-harness/bootstrap-snapshot.js +0 -149
- package/template/wall-e/eval/meta-harness/candidate-store.js +0 -117
- package/template/wall-e/eval/meta-harness/cli.js +0 -86
- package/template/wall-e/eval/meta-harness/domain-spec.js +0 -154
- package/template/wall-e/eval/meta-harness/domains/coding-agent.domain.json +0 -84
- package/template/wall-e/eval/meta-harness/examples/env-bootstrap-candidate.js +0 -29
- package/template/wall-e/eval/meta-harness/experience-store.js +0 -174
- package/template/wall-e/eval/meta-harness/frontier.js +0 -96
- package/template/wall-e/eval/meta-harness/harness-interface.js +0 -90
- package/template/wall-e/eval/meta-harness/leakage-guard.js +0 -80
- package/template/wall-e/eval/meta-harness/optimizer.js +0 -207
- package/template/wall-e/eval/meta-harness/proposer-runner.js +0 -110
- package/template/wall-e/eval/meta-harness/reporting.js +0 -58
- package/template/wall-e/eval/meta-harness/telemetry.js +0 -27
- package/template/wall-e/eval/meta-harness/validation.js +0 -81
- package/template/wall-e/eval/promoter.js +0 -228
- package/template/wall-e/eval/provider-normalizer.js +0 -33
- package/template/wall-e/eval/replay.js +0 -395
- package/template/wall-e/eval/run-agent-benchmarks.js +0 -386
- package/template/wall-e/eval/run-codex-cli-baseline.js +0 -177
- package/template/wall-e/eval/run-coding-agent-real.js +0 -187
- package/template/wall-e/eval/run-eval.js +0 -435
- package/template/wall-e/eval/run-model-comparison.js +0 -142
- package/template/wall-e/eval/session-evaluator.js +0 -187
- package/template/wall-e/eval/session-miner.js +0 -207
- package/template/wall-e/eval/session-retrieval-benchmark.js +0 -150
- package/template/wall-e/eval/session-transcripts.js +0 -509
- package/template/wall-e/eval/shadow.js +0 -161
- package/template/wall-e/eval/swebench-adapter.js +0 -345
- package/template/wall-e/eval/swebench-docker.js +0 -192
- package/template/wall-e/eval/train.py +0 -320
- package/template/wall-e/eval/trainer.js +0 -232
- package/template/wall-e/eval/weekly-eval-loop.js +0 -241
|
@@ -1,719 +0,0 @@
|
|
|
1
|
-
'use strict';
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* Claude Code session replay — runs past Claude Code sessions through
|
|
5
|
-
* the wall-e coding agent in isolated git worktrees, scores wall-e's
|
|
6
|
-
* output against Claude's actual tool calls + diff for the same task
|
|
7
|
-
* on the same repo state.
|
|
8
|
-
*
|
|
9
|
-
* Pipeline:
|
|
10
|
-
* 1. pickRecentSessions — walks ~/.claude/projects, filters by cwd + mtime
|
|
11
|
-
* 2. recoverRepoState — finds git commit at session start time
|
|
12
|
-
* 3. makeReplaySandbox — git worktree add at recovered commit
|
|
13
|
-
* 4. replayAndScore — runs wall-e on concatenated user messages,
|
|
14
|
-
* scores tool/file Jaccard + LLM judge
|
|
15
|
-
* 5. removeReplaySandbox — git worktree remove
|
|
16
|
-
*
|
|
17
|
-
* v1 uses concatenated-prompt scripted multi-turn (joins user messages
|
|
18
|
-
* with `\n\n[user follow-up N]\n`, N = 1-based message position) — coding-orchestrator's resumeFromCheckpoint
|
|
19
|
-
* does not natively support injecting new user messages mid-thread.
|
|
20
|
-
*/
|
|
21
|
-
|
|
22
|
-
const fs = require('fs');
|
|
23
|
-
const path = require('path');
|
|
24
|
-
const os = require('os');
|
|
25
|
-
const { execFileSync } = require('child_process');
|
|
26
|
-
|
|
27
|
-
const {
|
|
28
|
-
findJsonlFiles,
|
|
29
|
-
extractContent,
|
|
30
|
-
extractToolCalls,
|
|
31
|
-
findCommitsInWindow,
|
|
32
|
-
getCommitDiff,
|
|
33
|
-
} = require('./harvester');
|
|
34
|
-
|
|
35
|
-
const CLAUDE_PROJECTS_DIR = path.join(os.homedir(), '.claude', 'projects');
|
|
36
|
-
const SANDBOX_PREFIX = '/tmp/cc-replay-';
|
|
37
|
-
const MIN_PROMPT_CHARS = 20;
|
|
38
|
-
|
|
39
|
-
// ---------------------------------------------------------------------------
|
|
40
|
-
// pickRecentSessions
|
|
41
|
-
// ---------------------------------------------------------------------------
|
|
42
|
-
|
|
43
|
-
/**
 * Collect recent Claude Code sessions that can serve as replay inputs.
 *
 * Every JSONL file found under the Claude projects directory is checked:
 * files whose mtime is older than the `sinceDays` window are dropped, the
 * rest are parsed, and a session is kept only when its first user message
 * is at least MIN_PROMPT_CHARS long.
 *
 * @param {object} opts
 * @param {string} opts.repoPath - Keep only sessions whose `cwd` is under this path
 * @param {number} [opts.sinceDays=14] - Maximum file age (by mtime), in days
 * @param {number} [opts.limit=5] - Maximum number of sessions returned (newest mtime first)
 * @returns {Array<{
 *   sessionId: string, jsonlPath: string, cwd: string, gitBranch: string|null,
 *   userMessages: string[], tsStart: string, tsEnd: string, turnCount: number,
 *   claudeToolCalls: string[], claudeFilesEdited: string[]
 * }>}
 * @throws {Error} When `repoPath` is not provided
 */
function pickRecentSessions({ repoPath, sinceDays = 14, limit = 5 } = {}) {
  if (!repoPath) throw new Error('repoPath is required');
  if (!fs.existsSync(CLAUDE_PROJECTS_DIR)) return [];

  const cutoffMs = Date.now() - sinceDays * 24 * 60 * 60 * 1000;
  const found = [];

  for (const jsonlPath of findJsonlFiles(CLAUDE_PROJECTS_DIR)) {
    let fileStat;
    try {
      fileStat = fs.statSync(jsonlPath);
    } catch {
      // File vanished or is unreadable: not a candidate.
      continue;
    }
    const mtimeMs = fileStat.mtime.getTime();
    if (mtimeMs < cutoffMs) continue;

    const session = parseSessionJsonl(jsonlPath, repoPath);
    if (!session) continue;

    const hasPrompt = session.userMessages.length >= 1;
    if (!hasPrompt || session.userMessages[0].length < MIN_PROMPT_CHARS) continue;

    found.push({ session, mtimeMs });
  }

  // Newest first; the mtime is only a sort key and is not part of the result.
  found.sort((left, right) => right.mtimeMs - left.mtimeMs);
  return found.slice(0, limit).map((entry) => ({ ...entry.session }));
}
|
|
79
|
-
|
|
80
|
-
/**
 * Parse one Claude Code session JSONL file into a replay candidate.
 *
 * Returns null when the file cannot be read, when no event carries a `cwd`,
 * or when that `cwd` is neither `repoPath` itself nor a directory below it.
 * A session with zero usable user messages is still returned (with an empty
 * `userMessages` array); callers decide whether that is acceptable.
 *
 * @param {string} jsonlPath - Path of the session transcript
 * @param {string} repoPath - Repository root the session must belong to
 * @returns {object|null} Session summary: sessionId, jsonlPath, cwd, gitBranch,
 *   userMessages, tsStart, tsEnd, turnCount, claudeToolCalls, claudeFilesEdited
 */
function parseSessionJsonl(jsonlPath, repoPath) {
  let lines;
  try {
    lines = fs.readFileSync(jsonlPath, 'utf8').split('\n').filter(Boolean);
  } catch {
    return null;
  }

  const userMessages = [];
  const claudeToolCalls = [];
  const claudeFilesEditedSet = new Set();
  let cwd = null;
  let gitBranch = null;
  let tsStart = null;
  let tsEnd = null;
  const sessionId = path.basename(jsonlPath, '.jsonl');

  for (const line of lines) {
    let evt;
    try {
      evt = JSON.parse(line);
    } catch {
      continue; // tolerate truncated / corrupt lines
    }
    // A line such as `null` or `42` is valid JSON but not an event record;
    // property access on null would otherwise throw.
    if (!evt || typeof evt !== 'object') continue;

    // First occurrence wins for cwd/branch/start; the last timestamp seen is the end.
    if (evt.cwd && !cwd) cwd = evt.cwd;
    if (evt.gitBranch && !gitBranch) gitBranch = evt.gitBranch;
    if (evt.timestamp) {
      if (!tsStart) tsStart = evt.timestamp;
      tsEnd = evt.timestamp;
    }

    if (evt.type === 'user' && evt.message?.role === 'user') {
      const text = extractContent(evt.message);
      if (text && text.trim().length >= MIN_PROMPT_CHARS && !looksLikeToolResult(text)) {
        userMessages.push(text);
      }
    }

    if (evt.type === 'assistant' && evt.message?.role === 'assistant') {
      for (const tc of extractToolCalls(evt.message)) {
        if (tc.name) claudeToolCalls.push(tc.name);
        // Only files actually MODIFIED count toward claudeFilesEdited;
        // read/glob/grep style tools are skipped.
        if (!isEditToolName(tc.name)) continue;
        const fp = editedFileFromToolInput(tc.input || {});
        if (fp) claudeFilesEditedSet.add(normalizeSessionFile(fp, cwd));
      }
    }
  }

  if (!cwd) return null;

  // Match on a path-segment boundary: a bare prefix test would accept
  // `/work/repo-other` for a repoPath of `/work/repo`.
  const repoRoot = repoPath.replace(/\/+$/, '');
  if (cwd !== repoRoot && !cwd.startsWith(`${repoRoot}/`)) return null;

  return {
    sessionId,
    jsonlPath,
    cwd,
    gitBranch,
    userMessages,
    tsStart: tsStart || new Date(0).toISOString(),
    tsEnd: tsEnd || new Date(0).toISOString(),
    turnCount: userMessages.length,
    claudeToolCalls,
    claudeFilesEdited: [...claudeFilesEditedSet],
  };
}
|
|
144
|
-
|
|
145
|
-
/**
 * Tell whether a "user" transcript message is plumbing rather than a real
 * request: empty text, a wrapper/marker block at the start of the message,
 * or a bare slash-command with nothing after it.
 *
 * @param {string} text - Extracted message text
 * @returns {boolean} true when the message should not be replayed
 */
function looksLikeToolResult(text) {
  if (!text) return true;

  // Markers are only looked for at the very beginning of the message.
  const plumbingMarkers = [
    /^\s*\[Request interrupted/i,
    /^\s*<tool_use_error>/i,
    /^\s*<system-reminder>/i,
    /^\s*<local-command-caveat>/i,
    /^\s*<local-command-stdout>/i,
    /^\s*<local-command-stderr>/i,
    /^\s*<command-message>/i,
    /^\s*<command-name>/i,
    /^\s*<command-args>/i,
    // Skill bodies injected when a slash-command runs; wall-e has no
    // matching skill, so replaying the body would be meaningless.
    /^\s*Base directory for this skill:/i,
  ];

  const opening = text.slice(0, 400);
  for (const marker of plumbingMarkers) {
    if (marker.test(opening)) return true;
  }

  // A lone slash-command invocation with no follow-up prose.
  return /^\s*\/[a-z][\w:-]+\s*$/i.test(text.trim());
}
|
|
169
|
-
|
|
170
|
-
/**
 * Whether a tool name denotes a file-modifying tool (case-insensitive,
 * whole-name match against the known edit/write/patch tool names).
 *
 * @param {string} [name]
 * @returns {boolean}
 */
function isEditToolName(name) {
  const editToolPattern = /^(Edit|Write|MultiEdit|NotebookEdit|Patch|ApplyPatch|edit_file|write_file|apply_patch|multi_edit|str_replace|create_file)$/i;
  const candidate = name || '';
  return editToolPattern.test(candidate);
}
|
|
173
|
-
|
|
174
|
-
/**
 * Pull the target file path out of an edit tool's input object.
 * Keys are tried in a fixed order and the first truthy value wins.
 *
 * @param {object} input - Tool-call input
 * @returns {*} The first truthy path-like value, or null when none is set
 */
function editedFileFromToolInput(input) {
  const pathKeys = ['path', 'file_path', 'filePath', 'notebook_path', 'notebookPath'];
  for (const key of pathKeys) {
    const value = input[key];
    if (value) return value;
  }
  return null;
}
|
|
182
|
-
|
|
183
|
-
/**
 * Express an absolute file path relative to the session's cwd when the file
 * lives inside it; otherwise return the path unchanged.
 *
 * @param {string} filePath - Path taken from a tool call
 * @param {string|null} cwd - Session working directory (may be unknown)
 * @returns {string} cwd-relative path, or `filePath` as given when it is
 *   relative, when cwd is unknown, or when the file is outside / equal to cwd
 */
function normalizeSessionFile(filePath, cwd) {
  if (!filePath || !cwd || !path.isAbsolute(filePath)) return filePath;

  const rel = path.relative(cwd, filePath);
  // Only a leading `..` path SEGMENT means "outside cwd". A plain
  // startsWith('..') would also reject in-tree names such as `..cache/x`.
  const escapesCwd = rel === '..' || rel.startsWith(`..${path.sep}`);
  return rel && !escapesCwd && !path.isAbsolute(rel) ? rel : filePath;
}
|
|
188
|
-
|
|
189
|
-
// ---------------------------------------------------------------------------
|
|
190
|
-
// recoverRepoState
|
|
191
|
-
// ---------------------------------------------------------------------------
|
|
192
|
-
|
|
193
|
-
/**
 * Approximate the repository state at session start.
 *
 * Runs `git log --all --before=<tsStart> -n 1`, i.e. it returns the newest
 * commit on ANY ref dated at or before the session start — which is not
 * necessarily the commit that was checked out at that moment.
 *
 * @param {string} repoPath - Directory expected to contain a `.git` entry
 * @param {string|number|Date} tsStart - Session start time (anything `new Date()` accepts)
 * @returns {{commitSha: string, branch: null}|null} null when `repoPath` has no
 *   `.git`, when git fails, when the timestamp is invalid, or when no commit qualifies
 */
function recoverRepoState(repoPath, tsStart) {
  const gitMarker = path.join(repoPath, '.git');
  if (!fs.existsSync(gitMarker)) return null;

  let commitSha;
  try {
    // toISOString() throws on an invalid date; that is handled like a git failure.
    const before = new Date(tsStart).toISOString();
    const gitArgs = ['log', '--all', `--before=${before}`, '-n', '1', '--format=%H'];
    const gitOpts = { cwd: repoPath, encoding: 'utf8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'] };
    commitSha = execFileSync('git', gitArgs, gitOpts).trim();
  } catch {
    return null;
  }

  return commitSha ? { commitSha, branch: null } : null;
}
|
|
212
|
-
|
|
213
|
-
// ---------------------------------------------------------------------------
|
|
214
|
-
// sandbox lifecycle (git worktree)
|
|
215
|
-
// ---------------------------------------------------------------------------
|
|
216
|
-
|
|
217
|
-
/**
 * Create a detached git worktree for one replay at a fixed path derived
 * from the session id. A leftover sandbox at that path is removed first.
 *
 * @param {string} repoPath - Repository the worktree is created from
 * @param {string} commitSha - Commit to check out in the worktree
 * @param {string} sessionId - Used to build the sandbox directory name
 * @returns {string} Path of the new sandbox
 * @throws {Error} When `git worktree add` fails or times out
 */
function makeReplaySandbox(repoPath, commitSha, sessionId) {
  const sandbox = `${SANDBOX_PREFIX}${sessionId}`;

  const leftoverExists = fs.existsSync(sandbox);
  if (leftoverExists) {
    removeReplaySandbox(sandbox, repoPath);
  }

  const addArgs = ['worktree', 'add', '--detach', sandbox, commitSha];
  execFileSync('git', addArgs, { cwd: repoPath, stdio: 'pipe', timeout: 30000 });
  return sandbox;
}
|
|
225
|
-
|
|
226
|
-
/**
 * Tear down a replay sandbox. Three independent best-effort steps, each of
 * which may fail without stopping the next: `git worktree remove --force`,
 * recursive deletion of the directory, then `git worktree prune`.
 *
 * @param {string} sandbox - Sandbox directory to remove
 * @param {string} repoPath - Repository that owns the worktree
 * @returns {void} Never throws
 */
function removeReplaySandbox(sandbox, repoPath) {
  const bestEffort = (step) => {
    try {
      step();
    } catch {
      /* best-effort: continue with the remaining cleanup steps */
    }
  };

  bestEffort(() => {
    execFileSync('git', ['worktree', 'remove', '--force', sandbox], {
      cwd: repoPath, stdio: 'pipe', timeout: 30000,
    });
  });

  // Covers the case where git refused or the directory is no longer a worktree.
  bestEffort(() => {
    if (fs.existsSync(sandbox)) fs.rmSync(sandbox, { recursive: true, force: true });
  });

  bestEffort(() => {
    execFileSync('git', ['worktree', 'prune'], { cwd: repoPath, stdio: 'pipe', timeout: 10000 });
  });
}
|
|
241
|
-
|
|
242
|
-
/**
 * Remove every registered worktree whose path starts with SANDBOX_PREFIX.
 * Meant to run at the start of a batch to clear sandboxes left by crashes.
 * Any failure (git missing, not a repository, ...) is ignored.
 *
 * @param {string} repoPath - Repository whose worktree list is inspected
 * @returns {void}
 */
function gitWorktreeCleanup(repoPath) {
  try {
    const porcelain = execFileSync('git', ['worktree', 'list', '--porcelain'], {
      cwd: repoPath, encoding: 'utf8', timeout: 10000,
    });

    // Porcelain output has one `worktree <path>` line per worktree.
    const sandboxLine = new RegExp(`worktree (${SANDBOX_PREFIX}[^\\n]+)`, 'g');
    const stalePaths = [];
    for (const hit of porcelain.match(sandboxLine) || []) {
      stalePaths.push(hit.replace(/^worktree /, ''));
    }

    for (const stalePath of stalePaths) {
      removeReplaySandbox(stalePath, repoPath);
    }
  } catch {
    /* best-effort */
  }
}
|
|
256
|
-
|
|
257
|
-
// ---------------------------------------------------------------------------
|
|
258
|
-
// replayAndScore
|
|
259
|
-
// ---------------------------------------------------------------------------
|
|
260
|
-
|
|
261
|
-
/**
 * Replay a Claude Code session through wall-e and score the result.
 *
 * All user messages are merged into a single prompt (each follow-up is
 * introduced by `[user follow-up N]`), passed to `runAgentLoop` with the
 * sandbox as working directory, and the outcome is compared with what
 * Claude did: Jaccard overlap of canonical tool names, Jaccard overlap of
 * edited files and, optionally, an LLM judge over the two diffs.
 *
 * Composite score: 0.25·tool + 0.25·file + 0.50·judge when a judge score is
 * available, otherwise 0.5·tool + 0.5·file.
 *
 * @param {object} session - From pickRecentSessions
 * @param {string} sandbox - Path returned by makeReplaySandbox
 * @param {Function} runAgentLoop - From require('../coding-orchestrator')
 * @param {object} [opts]
 * @param {boolean} [opts.useLlmJudge=true]
 * @param {string} [opts.model]
 * @param {object} [opts.provider]
 * @param {number} [opts.timeoutMs=600000] - Passed to the agent; a hard
 *   timeout fires 60 s after it
 * @returns {Promise<object>} {sessionId, turnCount, latencyMs, scores, walleTools,
 *   walleFiles, claudeTools, claudeFiles, walleDiff, claudeDiff, usage, agentResult}.
 *   If the agent throws or the hard timeout fires, a reduced object with
 *   `error` and all-zero scores is returned instead of rejecting.
 */
async function replayAndScore(session, sandbox, runAgentLoop, opts = {}) {
  const { useLlmJudge = true, model, provider, timeoutMs = 600_000 } = opts;
  const startTime = Date.now();

  const concatenatedPrompt = session.userMessages.length === 1
    ? session.userMessages[0]
    : session.userMessages
      .map((msg, i) => (i === 0 ? msg : `\n\n[user follow-up ${i + 1}]\n${msg}`))
      .join('');

  let agentResult;
  let hardTimer;
  try {
    const agentPromise = runAgentLoop(concatenatedPrompt, {
      cwd: sandbox,
      timeoutMs,
      provider,
      model,
      mode: 'build',
      persistTranscript: false,
    });
    const hardTimeout = new Promise((_, reject) => {
      hardTimer = setTimeout(
        () => reject(new Error('cc-replay hard timeout exceeded')),
        timeoutMs + 60_000,
      );
    });
    agentResult = await Promise.race([agentPromise, hardTimeout]);
  } catch (e) {
    return {
      sessionId: session.sessionId,
      turnCount: session.turnCount,
      error: e.message,
      latencyMs: Date.now() - startTime,
      scores: { composite: 0, tool_jaccard: 0, file_jaccard: 0, judge_score: 0 },
      walleTools: [],
      claudeTools: session.claudeToolCalls,
      walleDiff: '',
      claudeDiff: '',
    };
  } finally {
    // A pending timer keeps the Node event loop alive, so without this the
    // process could not exit for timeoutMs + 60 s after every replay.
    clearTimeout(hardTimer);
  }

  const latencyMs = Date.now() - startTime;
  const walleTools = extractWalleTools(agentResult);
  const claudeTools = canonicalizeToolList(session.claudeToolCalls);
  const walleFiles = extractWalleFiles(agentResult, sandbox);
  const walleDiff = computeSandboxDiff(sandbox);
  const claudeDiff = computeClaudeDiff(session);

  // ---- scoring -----------------------------------------------------------
  const tool_jaccard = jaccard(new Set(walleTools), new Set(claudeTools));
  const file_jaccard = jaccard(
    canonicalizeReplayFiles(walleFiles),
    canonicalizeReplayFiles(session.claudeFilesEdited),
  );

  let judge_score = null;
  let judge_reason = null;
  // The judge is skipped when neither side produced a diff.
  if (useLlmJudge && (walleDiff || claudeDiff)) {
    try {
      const judged = await scoreLlmJudge({
        prompt: session.userMessages[0],
        walleDiff,
        claudeDiff,
        model,
      });
      judge_score = judged.score;
      judge_reason = judged.reason;
    } catch (e) {
      // A judge failure degrades to the tool/file-only composite.
      judge_reason = `judge failed: ${e.message}`;
    }
  }

  let composite;
  if (judge_score != null) {
    composite = 0.25 * tool_jaccard + 0.25 * file_jaccard + 0.50 * judge_score;
  } else {
    composite = 0.5 * tool_jaccard + 0.5 * file_jaccard;
  }

  return {
    sessionId: session.sessionId,
    turnCount: session.turnCount,
    latencyMs,
    scores: { composite, tool_jaccard, file_jaccard, judge_score, judge_reason },
    walleTools,
    walleFiles,
    claudeTools,
    claudeFiles: session.claudeFilesEdited,
    // Diffs are truncated to keep the result payload bounded.
    walleDiff: walleDiff.slice(0, 8000),
    claudeDiff: (claudeDiff || '').slice(0, 8000),
    usage: agentResult.usage || null,
    agentResult: summarizeAgentResult(agentResult),
  };
}
|
|
367
|
-
|
|
368
|
-
// ---------------------------------------------------------------------------
|
|
369
|
-
// scoring helpers
|
|
370
|
-
// ---------------------------------------------------------------------------
|
|
371
|
-
|
|
372
|
-
/**
 * Jaccard similarity |A ∩ B| / |A ∪ B| of two Sets.
 * Two empty sets are treated as identical and score 1.
 *
 * @param {Set} setA
 * @param {Set} setB
 * @returns {number} Value in [0, 1]
 */
function jaccard(setA, setB) {
  if (!setA.size && !setB.size) return 1;

  let shared = 0;
  for (const item of setA) {
    if (setB.has(item)) shared += 1;
  }

  const unionSize = new Set([...setA, ...setB]).size;
  if (unionSize === 0) return 0;
  return shared / unionSize;
}
|
|
378
|
-
|
|
379
|
-
/**
 * Canonicalize a list of file paths into a Set, dropping entries that
 * canonicalize to an empty string.
 *
 * @param {string[]|null} [files]
 * @returns {Set<string>}
 */
function canonicalizeReplayFiles(files = []) {
  const canonical = new Set();
  for (const file of files || []) {
    const cleaned = canonicalReplayFilePath(file);
    if (cleaned) canonical.add(cleaned);
  }
  return canonical;
}
|
|
382
|
-
|
|
383
|
-
/**
 * Put a file path into the form used for file-overlap scoring: trimmed,
 * forward slashes only, no leading `./`, and without a leading
 * `create-walle/template/` prefix.
 *
 * @param {*} filePath - Falsy values become the empty string
 * @returns {string}
 */
function canonicalReplayFilePath(filePath) {
  const templatePrefix = 'create-walle/template/';

  let cleaned = String(filePath || '').trim();
  cleaned = cleaned.replace(/\\/g, '/');
  cleaned = cleaned.replace(/^\.\//, '');

  return cleaned.startsWith(templatePrefix)
    ? cleaned.slice(templatePrefix.length)
    : cleaned;
}
|
|
391
|
-
|
|
392
|
-
/**
 * List the canonical names of all tool calls in an agent result's log,
 * in call order and with repeats preserved.
 *
 * @param {object} [agentResult] - Result whose `log` holds turns with `toolCalls`
 * @returns {string[]}
 */
function extractWalleTools(agentResult) {
  const names = [];
  const turns = agentResult?.log || [];
  for (const turn of turns) {
    const calls = turn.toolCalls || [];
    for (const call of calls) {
      const canonical = canonicalToolName(call.name);
      if (canonical) {
        names.push(canonical);
      }
    }
  }
  return names;
}
|
|
403
|
-
|
|
404
|
-
/**
 * Map raw tool names to their canonical form, dropping names that
 * canonicalize to nothing. Order and repeats are preserved.
 *
 * @param {string[]|null} [tools]
 * @returns {string[]}
 */
function canonicalizeToolList(tools = []) {
  const canonical = [];
  for (const rawName of tools || []) {
    const name = canonicalToolName(rawName);
    if (name) canonical.push(name);
  }
  return canonical;
}
|
|
407
|
-
|
|
408
|
-
/**
 * Collapse differently named but equivalent tools onto one canonical name
 * so tool lists from different agents can be compared. Names that match no
 * alias group are returned lower-cased.
 *
 * @param {*} name - Raw tool name
 * @returns {string|null} Canonical name, or null for an empty/blank name
 */
function canonicalToolName(name) {
  const raw = String(name || '').trim();
  if (!raw) return null;

  // Checked in order; the first matching group wins.
  const aliasGroups = [
    [/^(read|read_file)$/i, 'read_file'],
    [/^(bash|run_shell|exec_command|shell)$/i, 'run_shell'],
    [/^(grep|glob|toolsearch|list_directory|grep_files|search|rg)$/i, 'search'],
    [/^(edit|write|multiedit|notebookedit|patch|applypatch|apply_patch|edit_file|write_file|multi_edit|str_replace|create_file)$/i, 'edit'],
    [/^(agent|task|taskoutput)$/i, 'agent'],
    [/^askuserquestion$/i, 'ask_user'],
    [/^skill$/i, 'skill'],
    [/^mcp__/i, 'mcp'],
  ];

  const group = aliasGroups.find(([pattern]) => pattern.test(raw));
  return group ? group[1] : raw.toLowerCase();
}
|
|
421
|
-
|
|
422
|
-
/**
 * Collect the distinct files wall-e modified, based on its edit-type tool
 * calls: both the direct path argument and any file headers found inside a
 * patch payload (`patch_text`, `patchText` or `patch`) are recorded.
 *
 * @param {object} [agentResult] - Result whose `log` holds turns with `toolCalls`
 * @param {string} sandbox - Sandbox root used to relativize absolute paths
 * @returns {string[]} Unique paths in first-seen order
 */
function extractWalleFiles(agentResult, sandbox) {
  const touched = new Set();
  const record = (filePath) => {
    touched.add(normalizeWalleFilePath(filePath, sandbox));
  };

  for (const turn of agentResult?.log || []) {
    for (const call of turn.toolCalls || []) {
      if (!isEditToolName(call.name)) continue;

      const input = call.input || {};
      const directPath = editedFileFromToolInput(input);
      if (directPath) record(directPath);

      const patchBody = input.patch_text || input.patchText || input.patch || '';
      for (const patchedPath of extractPatchFilePaths(patchBody)) {
        record(patchedPath);
      }
    }
  }

  return [...touched];
}
|
|
440
|
-
|
|
441
|
-
/**
 * Make a path reported by wall-e comparable with repo-relative paths.
 * Relative paths only lose a leading `./`. Absolute paths are rewritten
 * relative to the sandbox when any spelling of the path falls under any
 * spelling of the sandbox root; otherwise they are returned unchanged.
 *
 * @param {string} filePath
 * @param {string} sandbox - Sandbox root directory
 * @returns {string}
 */
function normalizeWalleFilePath(filePath, sandbox) {
  if (!filePath || !sandbox) return filePath;

  if (!path.isAbsolute(filePath)) {
    return filePath.replace(/^\.\//, '');
  }

  const candidates = pathVariants(filePath);
  const roots = pathVariants(sandbox);
  for (const candidate of candidates) {
    for (const root of roots) {
      const relative = relativeIfUnder(candidate, root);
      if (relative) return relative;
    }
  }

  return filePath;
}
|
|
454
|
-
|
|
455
|
-
/**
 * Extract file paths from a patch payload that uses `*** Add File:`,
 * `*** Update File:`, `*** Delete File:` and `*** Move to:` header lines.
 *
 * @param {*} patchText - Non-string or empty input yields an empty list
 * @returns {string[]} Unique paths in first-seen order
 */
function extractPatchFilePaths(patchText) {
  if (!patchText || typeof patchText !== 'string') return [];

  const headerPatterns = [
    /^\*\*\* (?:Add|Update|Delete) File:\s+(.+?)\s*$/,
    /^\*\*\* Move to:\s+(.+?)\s*$/,
  ];

  const seen = new Set();
  for (const line of patchText.split('\n')) {
    for (const pattern of headerPatterns) {
      const captured = line.match(pattern)?.[1];
      if (captured) seen.add(captured.trim());
    }
  }
  return [...seen];
}
|
|
466
|
-
|
|
467
|
-
/**
 * Produce the alternative spellings of a path: as given, resolved,
 * symlink-resolved (when the path exists), plus the `/tmp` ↔ `/private/tmp`
 * counterpart of each of those.
 *
 * @param {string} filePath
 * @returns {string[]} Distinct non-empty variants, the original first
 */
function pathVariants(filePath) {
  const variants = new Set([filePath]);

  const tryAdd = (produce) => {
    try {
      variants.add(produce());
    } catch {
      // e.g. realpath of a path that does not exist
    }
  };
  tryAdd(() => path.resolve(filePath));
  tryAdd(() => fs.realpathSync(filePath));

  // Iterate over a snapshot so the aliases added below are not revisited.
  const snapshot = [...variants];
  for (const value of snapshot) {
    if (value.startsWith('/private/tmp/')) {
      variants.add(value.replace(/^\/private\/tmp\//, '/tmp/'));
    }
    if (value.startsWith('/tmp/')) {
      variants.add(value.replace(/^\/tmp\//, '/private/tmp/'));
    }
  }

  return [...variants].filter(Boolean);
}
|
|
477
|
-
|
|
478
|
-
/**
 * Return `filePath` relative to `root` (with forward slashes) when it lies
 * strictly below `root`; otherwise null.
 *
 * @param {string} filePath - Absolute path
 * @param {string} root - Absolute directory
 * @returns {string|null} null when either argument is missing or not
 *   absolute, when the path equals the root, or when it is outside the root
 */
function relativeIfUnder(filePath, root) {
  if (!filePath || !root || !path.isAbsolute(filePath) || !path.isAbsolute(root)) return null;

  const rel = path.relative(root, filePath);
  if (!rel || path.isAbsolute(rel)) return null;

  // Only a leading `..` path SEGMENT means "outside root". A plain
  // startsWith('..') would also reject in-tree names such as `..cache/x`.
  if (rel === '..' || rel.startsWith(`..${path.sep}`)) return null;

  return rel.replace(/\\/g, '/');
}
|
|
484
|
-
|
|
485
|
-
/**
 * Return the output of `git diff HEAD` run inside the sandbox, i.e. the
 * changes made to tracked files since the checked-out commit.
 *
 * @param {string} sandbox - Worktree directory
 * @returns {string} Diff text, or '' when git fails, times out or the
 *   output exceeds the 10 MiB buffer
 */
function computeSandboxDiff(sandbox) {
  const gitOpts = {
    cwd: sandbox, encoding: 'utf8', timeout: 15000, maxBuffer: 10 * 1024 * 1024,
  };

  let diffText = '';
  try {
    diffText = execFileSync('git', ['diff', 'HEAD'], gitOpts);
  } catch {
    diffText = '';
  }
  return diffText;
}
|
|
494
|
-
|
|
495
|
-
/**
 * Reduce an agent result to a small, bounded summary: status fields, the
 * first 1000 chars of stderr, the first 2000 chars of output, the number of
 * log entries and the session id.
 *
 * @param {*} agentResult
 * @returns {object|null} null when `agentResult` is not an object
 */
function summarizeAgentResult(agentResult) {
  if (!agentResult || typeof agentResult !== 'object') return null;

  const clip = (value, maxChars) => (value ? String(value).slice(0, maxChars) : '');
  const { success, exitCode, stderr, output, log, sessionId } = agentResult;

  return {
    success,
    exitCode,
    stderr: clip(stderr, 1000),
    output: clip(output, 2000),
    logLength: Array.isArray(log) ? log.length : 0,
    sessionId: sessionId || null,
  };
}
|
|
506
|
-
|
|
507
|
-
/**
 * Best-effort ground truth for what Claude changed: asks
 * `findCommitsInWindow` for commits in the session's cwd between `tsStart`
 * and `tsEnd` and returns the diff of the first one it reports.
 * NOTE(review): which commit comes first depends on the ordering used by
 * findCommitsInWindow in ./harvester — confirm there.
 *
 * @param {object} session - Needs `cwd`, `tsStart`, `tsEnd`
 * @returns {string|null} Diff text, or null when no commit was found, the
 *   diff is empty, or any step throws
 */
function computeClaudeDiff(session) {
  try {
    const { cwd, tsStart, tsEnd } = session;
    const commits = findCommitsInWindow(cwd, tsStart, tsEnd);
    if (commits.length === 0 || !commits.length) return null;

    const diffText = getCommitDiff(cwd, commits[0].hash);
    return diffText || null;
  } catch {
    return null;
  }
}
|
|
520
|
-
|
|
521
|
-
/**
 * LLM-as-judge: asks a model (Claude Haiku unless `model` is given) how
 * well wall-e's diff achieves the same intent as Claude's diff.
 *
 * The request and both diffs are truncated before being sent. The reply is
 * expected to contain `SCORE: <number>` and `REASON: <text>`; a reply
 * without a parsable score counts as 0.5, and scores are clamped to [0, 1].
 *
 * @param {object} args
 * @param {string} args.prompt - The user's original request
 * @param {string} args.walleDiff
 * @param {string|null} args.claudeDiff
 * @param {string} [args.model]
 * @returns {Promise<{score: number, reason: string|null}>}
 */
async function scoreLlmJudge({ prompt, walleDiff, claudeDiff, model }) {
  const { getDefaultClient } = require('../llm/client');
  const client = getDefaultClient();

  const requestText = (prompt || '').slice(0, 1000);
  const groundTruthDiff = (claudeDiff || '<no commit landed for this session>').slice(0, 4000);
  const candidateDiff = (walleDiff || '<empty>').slice(0, 4000);

  const promptLines = [
    `User's request: ${requestText}`,
    '',
    `--- Diff A (Claude Code's actual output, ground truth) ---`,
    groundTruthDiff,
    '',
    `--- Diff B (wall-e's output) ---`,
    candidateDiff,
    '',
    'Score 0.0 to 1.0: how well does Diff B accomplish the same intent as Diff A',
    'for the user\'s request? Consider goal achievement, not byte equality. If both',
    'diffs are empty/missing, score 0.5. Reply with ONLY a single line:',
    'SCORE: <0.0–1.0>',
    'REASON: <one sentence>',
  ];
  const judgePrompt = promptLines.join('\n');

  const resp = await client.chat({
    model: model || 'claude-haiku-4-5-20251001',
    messages: [{ role: 'user', content: judgePrompt }],
    maxTokens: 200,
  });

  const reply = (resp.content || '').trim();
  const scoreMatch = reply.match(/SCORE:\s*([0-9]*\.?[0-9]+)/i);
  const reasonMatch = reply.match(/REASON:\s*(.+)/i);

  const parsed = scoreMatch ? parseFloat(scoreMatch[1]) : 0.5;
  let score;
  if (!isFinite(parsed) || parsed < 0) {
    score = 0;
  } else if (parsed > 1) {
    score = 1;
  } else {
    score = parsed;
  }

  const reason = reasonMatch ? reasonMatch[1].trim() : null;
  return { score, reason };
}
|
|
557
|
-
|
|
558
|
-
// ---------------------------------------------------------------------------
|
|
559
|
-
// Reap mode — harvest sessions into the persistent benchmark catalog
|
|
560
|
-
// ---------------------------------------------------------------------------
|
|
561
|
-
|
|
562
|
-
const crypto = require('crypto');
|
|
563
|
-
const CODING_AGENT_BENCHMARKS_PATH = path.join(__dirname, 'benchmarks', 'coding-agent.json');
|
|
564
|
-
|
|
565
|
-
/**
 * Build a self-contained benchmark catalog entry from a picked session.
 *
 * The entry id is `agent-cc-` plus the first 8 hex chars of the SHA-256 of
 * `<sessionId>|<first user message>`. Everything needed to replay later is
 * embedded under `ccReplay`, so the entry does not depend on the original
 * JSONL file.
 *
 * @param {object} session - From pickRecentSessions
 * @param {string} recoveredCommit - Commit sha to replay against
 * @returns {object} Catalog entry (id, prompt, taskType, difficulty,
 *   expectedTraits, agentExpectations, ccReplay)
 */
function sessionToCatalogEntry(session, recoveredCommit) {
  const firstPrompt = session.userMessages[0] || '';
  const digest = crypto
    .createHash('sha256')
    .update(session.sessionId + '|' + firstPrompt)
    .digest('hex');
  const id = `agent-cc-${digest.slice(0, 8)}`;

  const tools = session.claudeToolCalls || [];
  const usesAny = (pattern) => tools.some((toolName) => pattern.test(toolName));

  // Traits are inferred from which kinds of tools Claude used.
  const traits = ['has code block'];
  if (usesAny(/read_file|glob|grep|Read|Grep/)) traits.push('reads before writing');
  if (usesAny(/edit_file|Edit|str_replace/)) traits.push('uses edit over write');
  if (usesAny(/bash|Bash/)) traits.push('runs commands');

  const turnCount = session.turnCount || 1;
  let difficulty = 'easy';
  if (turnCount > 10) {
    difficulty = 'hard';
  } else if (turnCount > 4) {
    difficulty = 'medium';
  }

  const filesEdited = session.claudeFilesEdited;

  return {
    id,
    prompt: firstPrompt,
    taskType: 'coding-agent',
    difficulty,
    expectedTraits: traits,
    agentExpectations: {
      expectedToolCalls: [...new Set(tools)].slice(0, 12),
      maxTurns: Math.min(turnCount * 2, 50),
      expectedFileChanges: filesEdited || [],
    },
    ccReplay: {
      sourceSessionId: session.sessionId,
      cwd: session.cwd,
      gitBranch: session.gitBranch || null,
      tsStart: session.tsStart,
      tsEnd: session.tsEnd,
      recoveredCommit,
      userMessages: session.userMessages,
      turnCount,
      claudeToolCalls: tools,
      claudeFilesEdited: filesEdited || [],
      reapedAt: new Date().toISOString(),
    },
  };
}
|
|
609
|
-
|
|
610
|
-
/**
 * Harvest fresh Claude Code sessions into the benchmark catalog file.
 *
 * Up to `limit * 4` recent sessions are considered so that non-replayable
 * ones can be skipped. A session is skipped when its source session id is
 * already in the catalog, when no repo state can be recovered for it, or
 * when its generated entry id already exists. The catalog file is rewritten
 * only if at least one entry was added.
 *
 * An unreadable or unparsable catalog file is treated as an empty catalog.
 *
 * @param {object} opts
 * @param {string} opts.repoPath
 * @param {number} [opts.limit=5] — max NEW entries to add this run
 * @param {number} [opts.sinceDays=14] — only consider JSONLs modified within this many days
 * @param {string} [opts.benchmarksPath] — catalog file; override for tests
 * @returns {{added: object[], skipped: number, total: number}}
 * @throws {Error} When `repoPath` is not provided
 */
function reapSessions({ repoPath, limit = 5, sinceDays = 14, benchmarksPath = CODING_AGENT_BENCHMARKS_PATH } = {}) {
  if (!repoPath) throw new Error('repoPath is required');

  let catalog;
  try {
    catalog = JSON.parse(fs.readFileSync(benchmarksPath, 'utf8'));
  } catch {
    catalog = [];
  }

  // Dedup indexes: by entry id and by originating session id.
  const knownIds = new Set(catalog.map((item) => item.id));
  const knownSources = new Set();
  catalog.forEach((item) => {
    if (item.ccReplay && item.ccReplay.sourceSessionId) {
      knownSources.add(item.ccReplay.sourceSessionId);
    }
  });

  const candidates = pickRecentSessions({ repoPath, sinceDays, limit: limit * 4 });

  const added = [];
  let skipped = 0;
  for (const session of candidates) {
    if (added.length >= limit) break;

    const alreadyReaped = knownSources.has(session.sessionId);
    // Without a recoverable commit the session cannot be replayed.
    const repo = alreadyReaped ? null : recoverRepoState(session.cwd, session.tsStart);
    const entry = repo ? sessionToCatalogEntry(session, repo.commitSha) : null;

    if (!entry || knownIds.has(entry.id)) {
      skipped += 1;
      continue;
    }

    catalog.push(entry);
    knownIds.add(entry.id);
    knownSources.add(session.sessionId);
    added.push(entry);
  }

  if (added.length > 0) {
    fs.writeFileSync(benchmarksPath, JSON.stringify(catalog, null, 2) + '\n');
  }

  return { added, skipped, total: catalog.length };
}
|
|
662
|
-
|
|
663
|
-
/**
 * Replay a catalog entry from its embedded `ccReplay` block; the original
 * JSONL is not needed. A sandbox worktree is created at the recorded commit,
 * the session is replayed and scored, and the sandbox is removed afterwards
 * whether or not the replay succeeded.
 *
 * @param {object} bench — catalog entry with bench.ccReplay
 * @param {Function} runAgentLoop
 * @param {object} [opts] — forwarded to replayAndScore
 * @returns {Promise<object>} same shape as replayAndScore
 * @throws {Error} When the entry has no `ccReplay` block, or sandbox creation fails
 */
async function replayFromCatalog(bench, runAgentLoop, opts = {}) {
  const cc = bench.ccReplay;
  if (!cc) throw new Error(`Benchmark ${bench.id} has no ccReplay block`);

  // Older entries may lack userMessages; fall back to the single prompt.
  const messages = cc.userMessages || [bench.prompt];
  const session = {
    sessionId: cc.sourceSessionId,
    cwd: cc.cwd,
    gitBranch: cc.gitBranch,
    userMessages: messages,
    tsStart: cc.tsStart,
    tsEnd: cc.tsEnd,
    turnCount: cc.turnCount || messages.length,
    claudeToolCalls: cc.claudeToolCalls || [],
    claudeFilesEdited: cc.claudeFilesEdited || [],
  };

  let sandbox = null;
  try {
    sandbox = makeReplaySandbox(cc.cwd, cc.recoveredCommit, session.sessionId);
    const outcome = await replayAndScore(session, sandbox, runAgentLoop, opts);
    return outcome;
  } finally {
    if (sandbox) {
      try {
        removeReplaySandbox(sandbox, cc.cwd);
      } catch (cleanupError) {
        console.warn('[cc-replay] catalog sandbox cleanup failed:', cleanupError.message);
      }
    }
  }
}
|
|
700
|
-
|
|
701
|
-
module.exports = {
|
|
702
|
-
pickRecentSessions,
|
|
703
|
-
recoverRepoState,
|
|
704
|
-
makeReplaySandbox,
|
|
705
|
-
removeReplaySandbox,
|
|
706
|
-
gitWorktreeCleanup,
|
|
707
|
-
replayAndScore,
|
|
708
|
-
reapSessions,
|
|
709
|
-
replayFromCatalog,
|
|
710
|
-
sessionToCatalogEntry,
|
|
711
|
-
// exposed for tests
|
|
712
|
-
parseSessionJsonl,
|
|
713
|
-
jaccard,
|
|
714
|
-
canonicalReplayFilePath,
|
|
715
|
-
canonicalToolName,
|
|
716
|
-
canonicalizeToolList,
|
|
717
|
-
extractWalleFiles,
|
|
718
|
-
extractPatchFilePaths,
|
|
719
|
-
};
|