create-walle 0.9.21 → 0.9.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -5
- package/package.json +2 -2
- package/template/CLAUDE.md +2 -2
- package/template/LICENSE +1 -1
- package/template/bin/ctm-dev-cleanup.js +24 -3
- package/template/bin/ctm-launch.sh +13 -0
- package/template/bin/dev.sh +156 -18
- package/template/bin/node-bin.sh +84 -0
- package/template/bin/pin-node.sh +51 -0
- package/template/claude-task-manager/api-prompts.js +1203 -182
- package/template/claude-task-manager/api-reviews.js +109 -15
- package/template/claude-task-manager/approval-agent.js +1360 -280
- package/template/claude-task-manager/bin/restart-ctm.sh +64 -23
- package/template/claude-task-manager/bin/storage-migration-supervisor.js +338 -0
- package/template/claude-task-manager/db.js +4417 -295
- package/template/claude-task-manager/docs/app-update-refresh-protocol.md +69 -0
- package/template/claude-task-manager/docs/approval-ai-refinement.md +138 -0
- package/template/claude-task-manager/docs/approval-rescue-loop.md +74 -0
- package/template/claude-task-manager/docs/codex-operational-warning-health.md +107 -0
- package/template/claude-task-manager/docs/codex-resume-state-guard-design.md +17 -12
- package/template/claude-task-manager/docs/codex-terminal-render-controller-handoff.md +311 -0
- package/template/claude-task-manager/docs/coding-agent-hooks-architecture.md +418 -0
- package/template/claude-task-manager/docs/conversation-import-freshness.md +20 -0
- package/template/claude-task-manager/docs/google-workspace-auth-health.md +77 -0
- package/template/claude-task-manager/docs/image-paste-ux.md +13 -0
- package/template/claude-task-manager/docs/ipad-web-preview.md +88 -0
- package/template/claude-task-manager/docs/main-loop-offload-architecture.md +66 -0
- package/template/claude-task-manager/docs/microsoft-dev-tunnel-phone-access-design.md +274 -519
- package/template/claude-task-manager/docs/mobile-live-streaming.md +27 -5
- package/template/claude-task-manager/docs/mobile-remote-submission-lifecycle.md +69 -0
- package/template/claude-task-manager/docs/phone-access-design.md +53 -15
- package/template/claude-task-manager/docs/phone-passkey-identity.md +122 -0
- package/template/claude-task-manager/docs/phone-setup.md +3 -0
- package/template/claude-task-manager/docs/prompt-editing-tree-design.md +25 -1
- package/template/claude-task-manager/docs/remote-desktop-access-design.md +268 -0
- package/template/claude-task-manager/docs/restart-lifecycle-architecture.md +95 -0
- package/template/claude-task-manager/docs/runtime-work-control-plane.md +53 -0
- package/template/claude-task-manager/docs/session-interactive-wait-surfaces.md +38 -0
- package/template/claude-task-manager/docs/session-needs-you-dismissal.md +84 -0
- package/template/claude-task-manager/docs/session-render-state-management-design.md +91 -3
- package/template/claude-task-manager/docs/session-standup-command-center-design.md +25 -1
- package/template/claude-task-manager/docs/session-title-authority.md +32 -0
- package/template/claude-task-manager/docs/session-workspace-binding.md +33 -0
- package/template/claude-task-manager/docs/skill-intent-resolution-design.md +72 -0
- package/template/claude-task-manager/docs/walle-mcp-supervisor-health.md +86 -0
- package/template/claude-task-manager/docs/walle-relay-phone-access-design.md +24 -15
- package/template/claude-task-manager/docs/walle-session-history-hydration.md +114 -0
- package/template/claude-task-manager/docs/walle-session-input-queue.md +104 -0
- package/template/claude-task-manager/docs/walle-session-model-catalog.md +90 -0
- package/template/claude-task-manager/docs/walle-session-model-preferences.md +15 -6
- package/template/claude-task-manager/git-utils.js +897 -27
- package/template/claude-task-manager/lib/agent-capabilities.js +33 -0
- package/template/claude-task-manager/lib/agent-cli-cache.js +37 -7
- package/template/claude-task-manager/lib/agent-hooks-installer.js +26 -2
- package/template/claude-task-manager/lib/agent-presets.js +17 -1
- package/template/claude-task-manager/lib/all-sessions-query.js +108 -0
- package/template/claude-task-manager/lib/approval-ai-refinement.js +488 -0
- package/template/claude-task-manager/lib/approval-self-adapt.js +168 -0
- package/template/claude-task-manager/lib/async-semaphore.js +44 -0
- package/template/claude-task-manager/lib/auth-context.js +5 -0
- package/template/claude-task-manager/lib/auth-rate-limit.js +47 -4
- package/template/claude-task-manager/lib/auth-rules.js +29 -2
- package/template/claude-task-manager/lib/auto-approval-verifier.js +129 -16
- package/template/claude-task-manager/lib/background-llm.js +144 -17
- package/template/claude-task-manager/lib/branch-inventory.js +212 -0
- package/template/claude-task-manager/lib/claude-desktop-sessions.js +15 -3
- package/template/claude-task-manager/lib/coalesce-sync-frames.js +151 -0
- package/template/claude-task-manager/lib/codex-launch-health.js +762 -0
- package/template/claude-task-manager/lib/codex-transcript-pager.js +51 -0
- package/template/claude-task-manager/lib/codex-zst.js +124 -0
- package/template/claude-task-manager/lib/coding-agent-models.js +233 -30
- package/template/claude-task-manager/lib/connection-health.js +232 -0
- package/template/claude-task-manager/lib/conversation-blob-parser.js +42 -0
- package/template/claude-task-manager/lib/conversation-tail-merge.js +89 -26
- package/template/claude-task-manager/lib/ctm-session-context-api.js +39 -10
- package/template/claude-task-manager/lib/cursor-conversation-store.js +354 -0
- package/template/claude-task-manager/lib/db-owner-worker-client.js +315 -0
- package/template/claude-task-manager/lib/document-review.js +141 -6
- package/template/claude-task-manager/lib/escalation-review.js +152 -0
- package/template/claude-task-manager/lib/graceful-shutdown.js +159 -0
- package/template/claude-task-manager/lib/headless-term-service.js +678 -0
- package/template/claude-task-manager/lib/heavy-worker-fallback.js +38 -0
- package/template/claude-task-manager/lib/jsonl-conversation-parser.js +542 -0
- package/template/claude-task-manager/lib/jsonl-range-reader.js +112 -0
- package/template/claude-task-manager/lib/main-db-census.js +216 -0
- package/template/claude-task-manager/lib/message-pagination.js +106 -4
- package/template/claude-task-manager/lib/microsoft-dev-tunnel-setup.js +750 -26
- package/template/claude-task-manager/lib/mobile-auth-api.js +274 -7
- package/template/claude-task-manager/lib/mobile-auth-store.js +592 -10
- package/template/claude-task-manager/lib/mobile-notification-dispatcher.js +15 -0
- package/template/claude-task-manager/lib/model-overview-brain-fallback.js +311 -0
- package/template/claude-task-manager/lib/model-overview-cache.js +141 -0
- package/template/claude-task-manager/lib/models-health-routing-notice.js +126 -0
- package/template/claude-task-manager/lib/node-pin-guard.js +93 -0
- package/template/claude-task-manager/lib/perf-tracker.js +242 -6
- package/template/claude-task-manager/lib/permission-match.js +76 -0
- package/template/claude-task-manager/lib/permission-sync.js +133 -20
- package/template/claude-task-manager/lib/process-title.js +35 -0
- package/template/claude-task-manager/lib/prompt-executions-query.js +25 -0
- package/template/claude-task-manager/lib/prompt-index-disk-cache.js +44 -0
- package/template/claude-task-manager/lib/prompt-intent.js +132 -0
- package/template/claude-task-manager/lib/provider-user-context.js +34 -0
- package/template/claude-task-manager/lib/read-pool-client.js +313 -0
- package/template/claude-task-manager/lib/readpool-breaker.js +31 -0
- package/template/claude-task-manager/lib/recent-sessions-breaker.js +12 -0
- package/template/claude-task-manager/lib/remote-feedback-client.js +72 -0
- package/template/claude-task-manager/lib/remote-relay-protocol.js +37 -4
- package/template/claude-task-manager/lib/remote-relay-store.js +159 -0
- package/template/claude-task-manager/lib/remote-submission-observer.js +278 -0
- package/template/claude-task-manager/lib/restart-guard.js +109 -0
- package/template/claude-task-manager/lib/restore-interruption-detector.js +439 -0
- package/template/claude-task-manager/lib/restore-policy.js +13 -0
- package/template/claude-task-manager/lib/restore-resume-batch.js +74 -0
- package/template/claude-task-manager/lib/restore-runtime.js +68 -0
- package/template/claude-task-manager/lib/restore-storm.js +34 -0
- package/template/claude-task-manager/lib/resume-cwd.js +36 -0
- package/template/claude-task-manager/lib/resume-preflight.js +313 -0
- package/template/claude-task-manager/lib/runtime-work-registry.js +444 -0
- package/template/claude-task-manager/lib/sanitize-openai-auth.js +31 -0
- package/template/claude-task-manager/lib/scheduler.js +21 -1
- package/template/claude-task-manager/lib/scrollback-snapshot-store.js +159 -0
- package/template/claude-task-manager/lib/serial-task-queue.js +64 -0
- package/template/claude-task-manager/lib/server-listeners.js +239 -0
- package/template/claude-task-manager/lib/session-capture.js +42 -7
- package/template/claude-task-manager/lib/session-content-backfill.js +131 -0
- package/template/claude-task-manager/lib/session-history.js +388 -43
- package/template/claude-task-manager/lib/session-host-manager.js +287 -0
- package/template/claude-task-manager/lib/session-image-refs.js +209 -0
- package/template/claude-task-manager/lib/session-jobs.js +399 -59
- package/template/claude-task-manager/lib/session-prompt-index.js +137 -0
- package/template/claude-task-manager/lib/session-restore.js +53 -0
- package/template/claude-task-manager/lib/session-standup.js +123 -23
- package/template/claude-task-manager/lib/session-state-bus.js +14 -0
- package/template/claude-task-manager/lib/session-stream.js +64 -16
- package/template/claude-task-manager/lib/session-timeline-summary.js +260 -0
- package/template/claude-task-manager/lib/session-token-usage.js +494 -0
- package/template/claude-task-manager/lib/session-workspace-binding.js +356 -0
- package/template/claude-task-manager/lib/setup-network-config.js +9 -0
- package/template/claude-task-manager/lib/size-cap.js +45 -0
- package/template/claude-task-manager/lib/size-cap.test.js +62 -0
- package/template/claude-task-manager/lib/skill-autocomplete.js +180 -1
- package/template/claude-task-manager/lib/skill-intent-resolver.js +304 -0
- package/template/claude-task-manager/lib/sqlite-driver.js +19 -3
- package/template/claude-task-manager/lib/standup-attention.js +7 -3
- package/template/claude-task-manager/lib/status-authority.js +39 -0
- package/template/claude-task-manager/lib/status-hooks.js +4 -0
- package/template/claude-task-manager/lib/storage-migration.js +235 -0
- package/template/claude-task-manager/lib/structured-capture.js +298 -0
- package/template/claude-task-manager/lib/sync-io-census.js +163 -0
- package/template/claude-task-manager/lib/tailscale-setup.js +6 -0
- package/template/claude-task-manager/lib/terminal-activity-evidence.js +33 -0
- package/template/claude-task-manager/lib/terminal-choice.js +364 -0
- package/template/claude-task-manager/lib/terminal-control-sanitize.js +17 -0
- package/template/claude-task-manager/lib/terminal-fingerprint.js +48 -0
- package/template/claude-task-manager/lib/terminal-output-flush.js +84 -0
- package/template/claude-task-manager/lib/timeline-order.js +122 -0
- package/template/claude-task-manager/lib/transcript-store.js +348 -43
- package/template/claude-task-manager/lib/transport-security.js +84 -1
- package/template/claude-task-manager/lib/wait-state.js +184 -0
- package/template/claude-task-manager/lib/walle-client.js +47 -5
- package/template/claude-task-manager/lib/walle-ctm-history.js +564 -4
- package/template/claude-task-manager/lib/walle-external-actions.js +135 -16
- package/template/claude-task-manager/lib/walle-history-hydration.js +46 -0
- package/template/claude-task-manager/lib/walle-native-health.js +403 -0
- package/template/claude-task-manager/lib/walle-repair.js +701 -0
- package/template/claude-task-manager/lib/walle-session-cache.js +109 -0
- package/template/claude-task-manager/lib/walle-session-context.js +57 -21
- package/template/claude-task-manager/lib/walle-session-model-catalog.js +34 -0
- package/template/claude-task-manager/lib/walle-supervisor.js +539 -63
- package/template/claude-task-manager/lib/walle-transcript.js +52 -0
- package/template/claude-task-manager/lib/worktree-active-sync.js +11 -7
- package/template/claude-task-manager/lib/worktree-cwd.js +32 -1
- package/template/claude-task-manager/package.json +1 -1
- package/template/claude-task-manager/prompt-harvest.js +89 -66
- package/template/claude-task-manager/providers/claude-code.js +51 -3
- package/template/claude-task-manager/providers/cursor.js +140 -45
- package/template/claude-task-manager/public/css/reviews.css +551 -61
- package/template/claude-task-manager/public/css/setup.css +191 -0
- package/template/claude-task-manager/public/css/walle-session.css +865 -10
- package/template/claude-task-manager/public/css/walle.css +154 -0
- package/template/claude-task-manager/public/designs/ai-providers-consolidation-v2.html +830 -0
- package/template/claude-task-manager/public/index.html +18516 -2058
- package/template/claude-task-manager/public/ipad.html +363 -0
- package/template/claude-task-manager/public/js/document-review-links.js +301 -0
- package/template/claude-task-manager/public/js/image-normalize.js +69 -36
- package/template/claude-task-manager/public/js/message-renderer.js +1265 -77
- package/template/claude-task-manager/public/js/prompts.js +66 -29
- package/template/claude-task-manager/public/js/reviews.js +901 -133
- package/template/claude-task-manager/public/js/session-activity-utils.js +11 -1
- package/template/claude-task-manager/public/js/session-search-utils.js +94 -10
- package/template/claude-task-manager/public/js/session-status-precedence.js +23 -5
- package/template/claude-task-manager/public/js/setup.js +1273 -176
- package/template/claude-task-manager/public/js/stream-view.js +691 -73
- package/template/claude-task-manager/public/js/terminal-reconciler.js +210 -0
- package/template/claude-task-manager/public/js/walle-session.js +2455 -158
- package/template/claude-task-manager/public/js/walle.js +455 -28
- package/template/claude-task-manager/public/m/app.css +2909 -262
- package/template/claude-task-manager/public/m/app.js +6601 -398
- package/template/claude-task-manager/public/m/claim.html +224 -17
- package/template/claude-task-manager/public/m/index.html +117 -21
- package/template/claude-task-manager/public/m/sw.js +3 -1
- package/template/claude-task-manager/public/manifest.json +2 -2
- package/template/claude-task-manager/public/prompts.html +30 -14
- package/template/claude-task-manager/queue-engine.js +507 -28
- package/template/claude-task-manager/scripts/repair-claude-session-images.js +27 -8
- package/template/claude-task-manager/server.js +14341 -2197
- package/template/claude-task-manager/session-integrity.js +160 -18
- package/template/claude-task-manager/session-search-ranking.js +1 -0
- package/template/claude-task-manager/session-utils.js +25 -5
- package/template/claude-task-manager/workers/approval-blocklist.js +96 -6
- package/template/claude-task-manager/workers/approval-widget-validator.js +14 -8
- package/template/claude-task-manager/workers/conversation-import-worker.js +11 -50
- package/template/claude-task-manager/workers/db-owner-worker.js +386 -0
- package/template/claude-task-manager/workers/harvest-worker.js +9 -55
- package/template/claude-task-manager/workers/headless-term-worker.js +9 -530
- package/template/claude-task-manager/workers/read-pool-worker.js +387 -0
- package/template/claude-task-manager/workers/scrollback-worker.js +11 -72
- package/template/claude-task-manager/workers/session-host-process.js +146 -0
- package/template/claude-task-manager/workers/session-integrity-worker.js +10 -54
- package/template/claude-task-manager/workers/state-detectors/base.js +18 -1
- package/template/claude-task-manager/workers/state-detectors/claude-code.js +182 -9
- package/template/claude-task-manager/workers/state-detectors/codex.js +150 -2
- package/template/claude-task-manager/workers/state-detectors/cursor.js +127 -0
- package/template/claude-task-manager/workers/state-detectors/gemini.js +21 -0
- package/template/claude-task-manager/workers/state-detectors/index.js +29 -0
- package/template/claude-task-manager/workers/state-detectors/opencode.js +103 -0
- package/template/docs/design/markdown-review-pane.md +206 -0
- package/template/docs/designs/2026-05-17-portkey-gateway-provider-ux.md +129 -38
- package/template/docs/designs/2026-05-20-mobile-worktree-finish-command.md +27 -0
- package/template/docs/designs/2026-05-22-ai-configuration-consolidation.md +248 -0
- package/template/docs/designs/ai-configuration-consolidation-mock.html +812 -0
- package/template/docs/private-memory-and-pii-policy.md +69 -0
- package/template/package.json +2 -1
- package/template/scripts/check-private-data.js +201 -0
- package/template/shared/sqlite-owner-guard.js +30 -0
- package/template/shared/sqlite-owner-write-queue.js +225 -0
- package/template/shared/sqlite-storage-policy.js +111 -0
- package/template/shared/sqlite-write-lock.js +428 -0
- package/template/wall-e/agent-runners/claude-code.js +5 -0
- package/template/wall-e/agent.js +166 -22
- package/template/wall-e/api-walle.js +524 -70
- package/template/wall-e/auth/provider-flows.js +11 -1
- package/template/wall-e/bin/walle-mcp-stdio.js +341 -17
- package/template/wall-e/brain.js +1614 -141
- package/template/wall-e/chat/attachment-blocks.js +96 -0
- package/template/wall-e/chat/attachments.js +2 -1
- package/template/wall-e/chat/capability-resolver.js +7 -7
- package/template/wall-e/chat/context-messages.js +28 -0
- package/template/wall-e/chat/conversation-frame.js +630 -0
- package/template/wall-e/chat/provider-messages.js +125 -0
- package/template/wall-e/chat.js +1002 -233
- package/template/wall-e/coding/acceptance-contract.js +170 -0
- package/template/wall-e/coding/acp-adapter.js +1 -1
- package/template/wall-e/coding/agent-catalog.js +3 -0
- package/template/wall-e/coding/artifact-store.js +93 -0
- package/template/wall-e/coding/capability-router.js +120 -0
- package/template/wall-e/coding/coding-run-controller.js +423 -0
- package/template/wall-e/coding/compaction-service.js +157 -12
- package/template/wall-e/coding/frontend-verification.js +258 -0
- package/template/wall-e/coding/lifecycle-hooks.js +75 -0
- package/template/wall-e/coding/local-preview-contract.js +157 -0
- package/template/wall-e/coding/permission-service.js +57 -13
- package/template/wall-e/coding/prompt-bundle.js +19 -1
- package/template/wall-e/coding/prompt-section-registry.js +227 -0
- package/template/wall-e/coding/provider-compat.js +15 -0
- package/template/wall-e/coding/runtime-events.js +224 -0
- package/template/wall-e/coding/runtime-mode.js +3 -0
- package/template/wall-e/coding/side-git-snapshot.js +160 -4
- package/template/wall-e/coding/snapshot-service.js +143 -1
- package/template/wall-e/coding/stream-processor.js +388 -34
- package/template/wall-e/coding/task-tool.js +141 -4
- package/template/wall-e/coding/tool-execution-controller.js +365 -0
- package/template/wall-e/coding/tool-registry.js +43 -5
- package/template/wall-e/coding/user-hooks.js +217 -0
- package/template/wall-e/coding-orchestrator.js +1330 -221
- package/template/wall-e/coding-prompts.js +20 -4
- package/template/wall-e/context/context-builder.js +15 -2
- package/template/wall-e/decision/confidence.js +1 -1
- package/template/wall-e/docs/coding-acceptance-contract.md +41 -0
- package/template/wall-e/docs/external-action-controller.md +26 -6
- package/template/wall-e/docs/telemetry-lifecycle.md +8 -2
- package/template/wall-e/embeddings.js +591 -53
- package/template/wall-e/external-action-controller.js +12 -0
- package/template/wall-e/http/auth.js +1 -0
- package/template/wall-e/http/chat-api.js +46 -11
- package/template/wall-e/http/model-admin.js +836 -34
- package/template/wall-e/lib/boot-profile.js +88 -0
- package/template/wall-e/lib/event-loop-monitor.js +93 -0
- package/template/wall-e/lib/service-health.js +194 -0
- package/template/wall-e/llm/anthropic.js +130 -5
- package/template/wall-e/llm/client.js +266 -63
- package/template/wall-e/llm/default-fallback.js +382 -0
- package/template/wall-e/llm/health.js +19 -0
- package/template/wall-e/llm/message-guard.js +78 -0
- package/template/wall-e/llm/model-catalog.js +252 -1
- package/template/wall-e/llm/openai.js +26 -4
- package/template/wall-e/llm/portkey-sync.js +654 -0
- package/template/wall-e/llm/provider-error.js +30 -2
- package/template/wall-e/llm/registry.js +5 -1
- package/template/wall-e/llm/request-compat.js +67 -0
- package/template/wall-e/loops/backfill.js +79 -23
- package/template/wall-e/loops/brain-optimize.js +67 -0
- package/template/wall-e/loops/ingest.js +25 -10
- package/template/wall-e/loops/question-digest.js +160 -0
- package/template/wall-e/loops/reflect.js +6 -4
- package/template/wall-e/loops/think.js +39 -12
- package/template/wall-e/mcp-server.js +318 -36
- package/template/wall-e/memory/ctm-context-client.js +52 -14
- package/template/wall-e/memory/ctm-operational-context.js +237 -0
- package/template/wall-e/memory/ctm-prompt-executions-client.js +128 -0
- package/template/wall-e/memory/ctm-session-context.js +111 -63
- package/template/wall-e/prompts/coding/deepseek.txt +3 -0
- package/template/wall-e/prompts/coding/gemini.txt +6 -0
- package/template/wall-e/prompts/coding/gpt.txt +6 -0
- package/template/wall-e/prompts/coding/local.txt +7 -0
- package/template/wall-e/runtime/decision-hooks.js +115 -0
- package/template/wall-e/runtime/devbox-gateway.js +82 -8
- package/template/wall-e/runtime/prompt-manifest.js +86 -0
- package/template/wall-e/runtime/tool-executor.js +269 -0
- package/template/wall-e/runtime/tool-result-envelope.js +138 -0
- package/template/wall-e/runtime/transcript-projection.js +60 -0
- package/template/wall-e/runtime/walle-runtime.js +224 -0
- package/template/wall-e/scripts/db-optimize/migrate.js +162 -0
- package/template/wall-e/scripts/db-optimize/recall-eval.js +117 -0
- package/template/wall-e/server.js +15 -0
- package/template/wall-e/session-files.js +9 -0
- package/template/wall-e/skills/_bundled/google-calendar/run.js +1 -1
- package/template/wall-e/skills/_bundled/gws-workspace/run.js +1 -1
- package/template/wall-e/skills/_bundled/slack-mentions/run.js +76 -6
- package/template/wall-e/skills/claude-code-reader.js +7 -3
- package/template/wall-e/skills/script-skill-runner.js +10 -0
- package/template/wall-e/skills/skill-planner.js +38 -0
- package/template/wall-e/tools/builtin-middleware.js +19 -9
- package/template/wall-e/tools/local-tools.js +1428 -16
- package/template/wall-e/tools/permission-checker.js +73 -5
- package/template/wall-e/tools/question-manager.js +117 -7
- package/template/wall-e/training/harvester.js +12 -28
- package/template/wall-e/training/replay.js +25 -80
- package/template/website/index.html +10 -10
- package/template/wall-e/eval/ab-test.js +0 -203
- package/template/wall-e/eval/agent-runner.js +0 -772
- package/template/wall-e/eval/agent-scorer.js +0 -461
- package/template/wall-e/eval/aggregator.js +0 -414
- package/template/wall-e/eval/allowed-test-commands.js +0 -34
- package/template/wall-e/eval/benchmark-generator.js +0 -113
- package/template/wall-e/eval/benchmarks/chat-eval.json +0 -1662
- package/template/wall-e/eval/benchmarks/chat.json +0 -82
- package/template/wall-e/eval/benchmarks/coding-agent-real.json +0 -1
- package/template/wall-e/eval/benchmarks/coding-agent.json +0 -1581
- package/template/wall-e/eval/benchmarks/coding.json +0 -122
- package/template/wall-e/eval/benchmarks/memory-retrieval.json +0 -234
- package/template/wall-e/eval/benchmarks/reasoning.json +0 -82
- package/template/wall-e/eval/benchmarks/swebench-lite-30.json +0 -212
- package/template/wall-e/eval/benchmarks.js +0 -669
- package/template/wall-e/eval/cc-replay.js +0 -719
- package/template/wall-e/eval/chat-eval.js +0 -525
- package/template/wall-e/eval/check-keys.js +0 -15
- package/template/wall-e/eval/check-providers.js +0 -42
- package/template/wall-e/eval/codex-cli-baseline.js +0 -669
- package/template/wall-e/eval/coding-agent-real.js +0 -570
- package/template/wall-e/eval/context-compactor.js +0 -251
- package/template/wall-e/eval/debug-agent003.js +0 -68
- package/template/wall-e/eval/diagnostics.js +0 -216
- package/template/wall-e/eval/eval-orchestrator.js +0 -642
- package/template/wall-e/eval/evaluate.js +0 -202
- package/template/wall-e/eval/evaluator.js +0 -373
- package/template/wall-e/eval/exporter.js +0 -212
- package/template/wall-e/eval/fixtures/express-basic/package.json +0 -9
- package/template/wall-e/eval/fixtures/express-basic/server.js +0 -115
- package/template/wall-e/eval/fixtures/express-basic/test.js +0 -83
- package/template/wall-e/eval/fixtures/express-buggy/package.json +0 -9
- package/template/wall-e/eval/fixtures/express-buggy/server.js +0 -113
- package/template/wall-e/eval/fixtures/express-buggy/test.js +0 -83
- package/template/wall-e/eval/fixtures/express-buggy-items/package.json +0 -9
- package/template/wall-e/eval/fixtures/express-buggy-items/server.js +0 -112
- package/template/wall-e/eval/fixtures/express-buggy-items/test.js +0 -83
- package/template/wall-e/eval/fixtures/express-buggy-search/package.json +0 -9
- package/template/wall-e/eval/fixtures/express-buggy-search/server.js +0 -121
- package/template/wall-e/eval/fixtures/express-buggy-search/test.js +0 -83
- package/template/wall-e/eval/fixtures/express-rename-data/data.js +0 -34
- package/template/wall-e/eval/fixtures/express-rename-data/package.json +0 -9
- package/template/wall-e/eval/fixtures/express-rename-data/server.js +0 -97
- package/template/wall-e/eval/fixtures/express-rename-data/test.js +0 -88
- package/template/wall-e/eval/fixtures/express-xss/package.json +0 -12
- package/template/wall-e/eval/fixtures/express-xss/server.js +0 -90
- package/template/wall-e/eval/fixtures/express-xss/test.js +0 -67
- package/template/wall-e/eval/fixtures/express-xss/views/profile.ejs +0 -9
- package/template/wall-e/eval/fixtures/fullstack-app/config/default.js +0 -9
- package/template/wall-e/eval/fixtures/fullstack-app/config/test.js +0 -13
- package/template/wall-e/eval/fixtures/fullstack-app/package.json +0 -11
- package/template/wall-e/eval/fixtures/fullstack-app/public/css/style.css +0 -137
- package/template/wall-e/eval/fixtures/fullstack-app/public/index.html +0 -46
- package/template/wall-e/eval/fixtures/fullstack-app/public/js/app.js +0 -121
- package/template/wall-e/eval/fixtures/fullstack-app/public/js/auth.js +0 -71
- package/template/wall-e/eval/fixtures/fullstack-app/public/js/items.js +0 -80
- package/template/wall-e/eval/fixtures/fullstack-app/public/js/users.js +0 -46
- package/template/wall-e/eval/fixtures/fullstack-app/public/login.html +0 -45
- package/template/wall-e/eval/fixtures/fullstack-app/public/register.html +0 -38
- package/template/wall-e/eval/fixtures/fullstack-app/scripts/migrate.js +0 -23
- package/template/wall-e/eval/fixtures/fullstack-app/scripts/seed.js +0 -46
- package/template/wall-e/eval/fixtures/fullstack-app/server/db.js +0 -99
- package/template/wall-e/eval/fixtures/fullstack-app/server/index.js +0 -94
- package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/auth.js +0 -19
- package/template/wall-e/eval/fixtures/fullstack-app/server/middleware/logger.js +0 -19
- package/template/wall-e/eval/fixtures/fullstack-app/server/router.js +0 -50
- package/template/wall-e/eval/fixtures/fullstack-app/server/routes/auth.js +0 -69
- package/template/wall-e/eval/fixtures/fullstack-app/server/routes/health.js +0 -23
- package/template/wall-e/eval/fixtures/fullstack-app/server/routes/items.js +0 -88
- package/template/wall-e/eval/fixtures/fullstack-app/server/routes/users.js +0 -75
- package/template/wall-e/eval/fixtures/fullstack-app/server/test.js +0 -198
- package/template/wall-e/eval/fixtures/fullstack-app/server/utils/response.js +0 -34
- package/template/wall-e/eval/fixtures/fullstack-app/server/utils/validate.js +0 -26
- package/template/wall-e/eval/fixtures/fullstack-app/server.js +0 -8
- package/template/wall-e/eval/fixtures/fullstack-app/test.js +0 -12
- package/template/wall-e/eval/fixtures/monorepo-basic/package.json +0 -8
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/data.js +0 -58
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/middleware.js +0 -46
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/package.json +0 -8
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/routes.js +0 -64
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/server.js +0 -56
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/api/test.js +0 -116
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/commands.js +0 -61
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/index.js +0 -62
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/output.js +0 -43
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/package.json +0 -11
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/cli/test.js +0 -44
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/formatters.js +0 -43
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/index.js +0 -12
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/package.json +0 -5
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/test.js +0 -55
- package/template/wall-e/eval/fixtures/monorepo-basic/packages/shared/validators.js +0 -29
- package/template/wall-e/eval/fixtures/monorepo-basic/test.js +0 -46
- package/template/wall-e/eval/fixtures/node-cli/index.js +0 -78
- package/template/wall-e/eval/fixtures/node-cli/package.json +0 -10
- package/template/wall-e/eval/fixtures/node-cli/test.js +0 -57
- package/template/wall-e/eval/fixtures/node-typed/package.json +0 -8
- package/template/wall-e/eval/fixtures/node-typed/src/handlers.js +0 -31
- package/template/wall-e/eval/fixtures/node-typed/src/utils.js +0 -33
- package/template/wall-e/eval/fixtures/node-typed/test.js +0 -36
- package/template/wall-e/eval/fixtures/python-flask/app.py +0 -14
- package/template/wall-e/eval/fixtures/python-flask/requirements.txt +0 -2
- package/template/wall-e/eval/fixtures/python-flask/test_app.py +0 -25
- package/template/wall-e/eval/fixtures/wall-e-subset/brain.js +0 -105
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/aggregator.js +0 -101
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/chat.json +0 -20
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks/coding.json +0 -32
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/benchmarks.js +0 -64
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/package.json +0 -6
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/server.js +0 -31
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/test.js +0 -18
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/fixtures/simple-project/utils.js +0 -34
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/runner.js +0 -104
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/scorer.js +0 -73
- package/template/wall-e/eval/fixtures/wall-e-subset/eval/test.js +0 -134
- package/template/wall-e/eval/fixtures/wall-e-subset/llm/client.js +0 -99
- package/template/wall-e/eval/fixtures/wall-e-subset/llm/providers.js +0 -63
- package/template/wall-e/eval/fixtures/wall-e-subset/llm/test.js +0 -70
- package/template/wall-e/eval/fixtures/wall-e-subset/package.json +0 -10
- package/template/wall-e/eval/fixtures/wall-e-subset/test.js +0 -86
- package/template/wall-e/eval/harvester.js +0 -685
- package/template/wall-e/eval/head-to-head.js +0 -388
- package/template/wall-e/eval/humaneval-adapter.js +0 -321
- package/template/wall-e/eval/list-models.js +0 -31
- package/template/wall-e/eval/livecodebench-adapter.js +0 -291
- package/template/wall-e/eval/mail-integration.js +0 -443
- package/template/wall-e/eval/manifest.js +0 -186
- package/template/wall-e/eval/meta-harness/adapters/coding-agent.js +0 -57
- package/template/wall-e/eval/meta-harness/bootstrap-snapshot.js +0 -149
- package/template/wall-e/eval/meta-harness/candidate-store.js +0 -117
- package/template/wall-e/eval/meta-harness/cli.js +0 -86
- package/template/wall-e/eval/meta-harness/domain-spec.js +0 -154
- package/template/wall-e/eval/meta-harness/domains/coding-agent.domain.json +0 -84
- package/template/wall-e/eval/meta-harness/examples/env-bootstrap-candidate.js +0 -29
- package/template/wall-e/eval/meta-harness/experience-store.js +0 -174
- package/template/wall-e/eval/meta-harness/frontier.js +0 -96
- package/template/wall-e/eval/meta-harness/harness-interface.js +0 -90
- package/template/wall-e/eval/meta-harness/leakage-guard.js +0 -80
- package/template/wall-e/eval/meta-harness/optimizer.js +0 -207
- package/template/wall-e/eval/meta-harness/proposer-runner.js +0 -110
- package/template/wall-e/eval/meta-harness/reporting.js +0 -58
- package/template/wall-e/eval/meta-harness/telemetry.js +0 -27
- package/template/wall-e/eval/meta-harness/validation.js +0 -81
- package/template/wall-e/eval/promoter.js +0 -228
- package/template/wall-e/eval/provider-normalizer.js +0 -33
- package/template/wall-e/eval/replay.js +0 -395
- package/template/wall-e/eval/run-agent-benchmarks.js +0 -386
- package/template/wall-e/eval/run-codex-cli-baseline.js +0 -177
- package/template/wall-e/eval/run-coding-agent-real.js +0 -187
- package/template/wall-e/eval/run-eval.js +0 -435
- package/template/wall-e/eval/run-model-comparison.js +0 -142
- package/template/wall-e/eval/session-evaluator.js +0 -187
- package/template/wall-e/eval/session-miner.js +0 -207
- package/template/wall-e/eval/session-retrieval-benchmark.js +0 -150
- package/template/wall-e/eval/session-transcripts.js +0 -509
- package/template/wall-e/eval/shadow.js +0 -161
- package/template/wall-e/eval/swebench-adapter.js +0 -345
- package/template/wall-e/eval/swebench-docker.js +0 -192
- package/template/wall-e/eval/train.py +0 -320
- package/template/wall-e/eval/trainer.js +0 -232
- package/template/wall-e/eval/weekly-eval-loop.js +0 -241
|
@@ -1,1581 +0,0 @@
|
|
|
1
|
-
[
|
|
2
|
-
{
|
|
3
|
-
"id": "agent-001",
|
|
4
|
-
"prompt": "Add a /health endpoint to the Express app that returns { status: 'ok', uptime: process.uptime() }",
|
|
5
|
-
"taskType": "coding-agent",
|
|
6
|
-
"difficulty": "easy",
|
|
7
|
-
"expectedTraits": [
|
|
8
|
-
"has code block",
|
|
9
|
-
"reads before writing",
|
|
10
|
-
"uses edit over write"
|
|
11
|
-
],
|
|
12
|
-
"agentExpectations": {
|
|
13
|
-
"expectedToolCalls": [
|
|
14
|
-
"read_file",
|
|
15
|
-
"glob",
|
|
16
|
-
"edit_file"
|
|
17
|
-
],
|
|
18
|
-
"forbiddenToolCalls": [
|
|
19
|
-
"write_file"
|
|
20
|
-
],
|
|
21
|
-
"maxTurns": 8,
|
|
22
|
-
"expectedFileChanges": [
|
|
23
|
-
"server.js"
|
|
24
|
-
],
|
|
25
|
-
"testCommand": "npm test",
|
|
26
|
-
"projectFixture": "express-basic"
|
|
27
|
-
}
|
|
28
|
-
},
|
|
29
|
-
{
|
|
30
|
-
"id": "agent-002",
|
|
31
|
-
"prompt": "Add input validation to the POST /users endpoint: require 'name' (string, 1-100 chars) and 'email' (valid email format). Return 400 with descriptive error messages on invalid input.",
|
|
32
|
-
"taskType": "coding-agent",
|
|
33
|
-
"difficulty": "medium",
|
|
34
|
-
"expectedTraits": [
|
|
35
|
-
"has code block",
|
|
36
|
-
"reads before writing",
|
|
37
|
-
"uses edit over write",
|
|
38
|
-
"handles errors gracefully"
|
|
39
|
-
],
|
|
40
|
-
"agentExpectations": {
|
|
41
|
-
"expectedToolCalls": [
|
|
42
|
-
"read_file",
|
|
43
|
-
"glob",
|
|
44
|
-
"edit_file"
|
|
45
|
-
],
|
|
46
|
-
"forbiddenToolCalls": [],
|
|
47
|
-
"maxTurns": 10,
|
|
48
|
-
"expectedFileChanges": [
|
|
49
|
-
"server.js"
|
|
50
|
-
],
|
|
51
|
-
"testCommand": "npm test",
|
|
52
|
-
"projectFixture": "express-basic"
|
|
53
|
-
}
|
|
54
|
-
},
|
|
55
|
-
{
|
|
56
|
-
"id": "agent-003",
|
|
57
|
-
"prompt": "There's a bug in the GET /items/:id endpoint - it returns 200 with an empty body when the item doesn't exist instead of returning 404. Find and fix the bug, then verify with the existing tests.",
|
|
58
|
-
"taskType": "coding-agent",
|
|
59
|
-
"difficulty": "medium",
|
|
60
|
-
"expectedTraits": [
|
|
61
|
-
"has code block",
|
|
62
|
-
"reads before writing",
|
|
63
|
-
"runs tests after changes",
|
|
64
|
-
"explains the bug"
|
|
65
|
-
],
|
|
66
|
-
"agentExpectations": {
|
|
67
|
-
"expectedToolCalls": [
|
|
68
|
-
"read_file",
|
|
69
|
-
"grep_files",
|
|
70
|
-
"edit_file",
|
|
71
|
-
"run_shell"
|
|
72
|
-
],
|
|
73
|
-
"forbiddenToolCalls": [],
|
|
74
|
-
"maxTurns": 10,
|
|
75
|
-
"expectedFileChanges": [
|
|
76
|
-
"server.js"
|
|
77
|
-
],
|
|
78
|
-
"testCommand": "npm test",
|
|
79
|
-
"projectFixture": "express-buggy-items"
|
|
80
|
-
}
|
|
81
|
-
},
|
|
82
|
-
{
|
|
83
|
-
"id": "agent-004",
|
|
84
|
-
"prompt": "Create a config.js module that reads configuration from environment variables with sensible defaults: PORT (default 3000), NODE_ENV (default 'development'), LOG_LEVEL (default 'info'), and DATABASE_URL (default 'sqlite://local.db'). Export a frozen config object.",
|
|
85
|
-
"taskType": "coding-agent",
|
|
86
|
-
"difficulty": "easy",
|
|
87
|
-
"expectedTraits": [
|
|
88
|
-
"has code block",
|
|
89
|
-
"defines function",
|
|
90
|
-
"uses glob for discovery"
|
|
91
|
-
],
|
|
92
|
-
"agentExpectations": {
|
|
93
|
-
"expectedToolCalls": [
|
|
94
|
-
"glob",
|
|
95
|
-
"list_directory",
|
|
96
|
-
"write_file"
|
|
97
|
-
],
|
|
98
|
-
"forbiddenToolCalls": [],
|
|
99
|
-
"maxTurns": 8,
|
|
100
|
-
"expectedFileChanges": [
|
|
101
|
-
"config.js"
|
|
102
|
-
],
|
|
103
|
-
"testCommand": "npm test",
|
|
104
|
-
"projectFixture": "express-basic"
|
|
105
|
-
}
|
|
106
|
-
},
|
|
107
|
-
{
|
|
108
|
-
"id": "agent-005",
|
|
109
|
-
"prompt": "Create a middleware/logger.js file that logs each incoming request with method, URL, status code, and response time in milliseconds. It should write to stdout in the format: '[timestamp] METHOD /path STATUS duration_ms'.",
|
|
110
|
-
"taskType": "coding-agent",
|
|
111
|
-
"difficulty": "easy",
|
|
112
|
-
"expectedTraits": [
|
|
113
|
-
"has code block",
|
|
114
|
-
"defines function",
|
|
115
|
-
"uses glob for discovery"
|
|
116
|
-
],
|
|
117
|
-
"agentExpectations": {
|
|
118
|
-
"expectedToolCalls": [
|
|
119
|
-
"glob",
|
|
120
|
-
"list_directory",
|
|
121
|
-
"write_file",
|
|
122
|
-
"read_file"
|
|
123
|
-
],
|
|
124
|
-
"forbiddenToolCalls": [],
|
|
125
|
-
"maxTurns": 8,
|
|
126
|
-
"expectedFileChanges": [
|
|
127
|
-
"middleware/logger.js"
|
|
128
|
-
],
|
|
129
|
-
"testCommand": "npm test",
|
|
130
|
-
"projectFixture": "express-basic"
|
|
131
|
-
}
|
|
132
|
-
},
|
|
133
|
-
{
|
|
134
|
-
"id": "agent-006",
|
|
135
|
-
"prompt": "Extract all route handlers from server.js into a separate routes/index.js module using Express Router. The server.js should import and mount the router. Ensure all existing tests still pass.",
|
|
136
|
-
"taskType": "coding-agent",
|
|
137
|
-
"difficulty": "hard",
|
|
138
|
-
"expectedTraits": [
|
|
139
|
-
"has code block",
|
|
140
|
-
"reads before writing",
|
|
141
|
-
"runs tests after changes",
|
|
142
|
-
"multi-file coordination",
|
|
143
|
-
"efficient tool use"
|
|
144
|
-
],
|
|
145
|
-
"agentExpectations": {
|
|
146
|
-
"expectedToolCalls": [
|
|
147
|
-
"read_file",
|
|
148
|
-
"glob",
|
|
149
|
-
"write_file",
|
|
150
|
-
"edit_file",
|
|
151
|
-
"run_shell"
|
|
152
|
-
],
|
|
153
|
-
"forbiddenToolCalls": [],
|
|
154
|
-
"maxTurns": 15,
|
|
155
|
-
"expectedFileChanges": [
|
|
156
|
-
"package.json",
|
|
157
|
-
"package-lock.json",
|
|
158
|
-
"server.js",
|
|
159
|
-
"routes/index.js"
|
|
160
|
-
],
|
|
161
|
-
"testCommand": "npm test",
|
|
162
|
-
"projectFixture": "express-basic"
|
|
163
|
-
}
|
|
164
|
-
},
|
|
165
|
-
{
|
|
166
|
-
"id": "agent-007",
|
|
167
|
-
"prompt": "Rename the 'getData' function to 'fetchRecords' across all files in the project. Make sure imports, exports, function definitions, and all call sites are updated consistently.",
|
|
168
|
-
"taskType": "coding-agent",
|
|
169
|
-
"difficulty": "medium",
|
|
170
|
-
"expectedTraits": [
|
|
171
|
-
"reads before writing",
|
|
172
|
-
"uses grep for search",
|
|
173
|
-
"multi-file coordination",
|
|
174
|
-
"efficient tool use"
|
|
175
|
-
],
|
|
176
|
-
"agentExpectations": {
|
|
177
|
-
"expectedToolCalls": [
|
|
178
|
-
"grep_files",
|
|
179
|
-
"read_file",
|
|
180
|
-
"edit_file"
|
|
181
|
-
],
|
|
182
|
-
"forbiddenToolCalls": [],
|
|
183
|
-
"maxTurns": 12,
|
|
184
|
-
"expectedFileChanges": [
|
|
185
|
-
"data.js",
|
|
186
|
-
"server.js",
|
|
187
|
-
"test.js"
|
|
188
|
-
],
|
|
189
|
-
"testCommand": "npm test",
|
|
190
|
-
"projectFixture": "express-rename-data"
|
|
191
|
-
}
|
|
192
|
-
},
|
|
193
|
-
{
|
|
194
|
-
"id": "agent-008",
|
|
195
|
-
"prompt": "The test suite has 2 failing tests. Run the tests, diagnose each failure, fix the code (not the tests), and verify all tests pass. Do not modify any test files.",
|
|
196
|
-
"taskType": "coding-agent",
|
|
197
|
-
"difficulty": "hard",
|
|
198
|
-
"expectedTraits": [
|
|
199
|
-
"runs tests after changes",
|
|
200
|
-
"reads before writing",
|
|
201
|
-
"explains the bug",
|
|
202
|
-
"handles errors gracefully"
|
|
203
|
-
],
|
|
204
|
-
"agentExpectations": {
|
|
205
|
-
"expectedToolCalls": [
|
|
206
|
-
"run_shell",
|
|
207
|
-
"read_file",
|
|
208
|
-
"grep_files",
|
|
209
|
-
"edit_file"
|
|
210
|
-
],
|
|
211
|
-
"forbiddenToolCalls": [],
|
|
212
|
-
"maxTurns": 15,
|
|
213
|
-
"expectedFileChanges": [
|
|
214
|
-
"server.js"
|
|
215
|
-
],
|
|
216
|
-
"testCommand": "npm test",
|
|
217
|
-
"projectFixture": "express-buggy"
|
|
218
|
-
}
|
|
219
|
-
},
|
|
220
|
-
{
|
|
221
|
-
"id": "agent-009",
|
|
222
|
-
"prompt": "The GET /users endpoint will crash if the users data source returns undefined or null instead of an array. Add defensive checks to the /users handler so it returns an empty array instead of crashing. Also review the /items/:id endpoint for similar missing null checks. Run tests to verify nothing breaks.",
|
|
223
|
-
"taskType": "coding-agent",
|
|
224
|
-
"difficulty": "medium",
|
|
225
|
-
"expectedTraits": [
|
|
226
|
-
"has code block",
|
|
227
|
-
"reads before writing",
|
|
228
|
-
"runs tests after changes",
|
|
229
|
-
"handles errors gracefully"
|
|
230
|
-
],
|
|
231
|
-
"agentExpectations": {
|
|
232
|
-
"expectedToolCalls": [
|
|
233
|
-
"read_file",
|
|
234
|
-
"grep_files",
|
|
235
|
-
"edit_file",
|
|
236
|
-
"run_shell"
|
|
237
|
-
],
|
|
238
|
-
"forbiddenToolCalls": [],
|
|
239
|
-
"maxTurns": 12,
|
|
240
|
-
"expectedFileChanges": [
|
|
241
|
-
"server.js"
|
|
242
|
-
],
|
|
243
|
-
"testCommand": "npm test",
|
|
244
|
-
"projectFixture": "express-basic"
|
|
245
|
-
}
|
|
246
|
-
},
|
|
247
|
-
{
|
|
248
|
-
"id": "agent-010",
|
|
249
|
-
"prompt": "A test expects the /search endpoint to return results sorted by relevance score descending, but it's returning them in insertion order. Find the failing test, read the endpoint code, and fix the sorting. Don't change the tests.",
|
|
250
|
-
"taskType": "coding-agent",
|
|
251
|
-
"difficulty": "medium",
|
|
252
|
-
"expectedTraits": [
|
|
253
|
-
"runs tests after changes",
|
|
254
|
-
"reads before writing",
|
|
255
|
-
"uses grep for search",
|
|
256
|
-
"explains the bug"
|
|
257
|
-
],
|
|
258
|
-
"agentExpectations": {
|
|
259
|
-
"expectedToolCalls": [
|
|
260
|
-
"run_shell",
|
|
261
|
-
"read_file",
|
|
262
|
-
"grep_files",
|
|
263
|
-
"edit_file"
|
|
264
|
-
],
|
|
265
|
-
"forbiddenToolCalls": [],
|
|
266
|
-
"maxTurns": 10,
|
|
267
|
-
"expectedFileChanges": [
|
|
268
|
-
"server.js"
|
|
269
|
-
],
|
|
270
|
-
"testCommand": "npm test",
|
|
271
|
-
"projectFixture": "express-buggy-search"
|
|
272
|
-
}
|
|
273
|
-
},
|
|
274
|
-
{
|
|
275
|
-
"id": "agent-011",
|
|
276
|
-
"prompt": "Design and plan the migration from our monolithic Express app to a microservices architecture. Create a migration-plan.md with: current architecture analysis, proposed service boundaries, data ownership strategy, migration phases with timeline, and risk mitigation. Read the existing codebase first to understand current structure.",
|
|
277
|
-
"taskType": "coding-agent",
|
|
278
|
-
"difficulty": "hard",
|
|
279
|
-
"expectedTraits": [
|
|
280
|
-
"reads before writing",
|
|
281
|
-
"plans before executing",
|
|
282
|
-
"uses glob for discovery"
|
|
283
|
-
],
|
|
284
|
-
"agentExpectations": {
|
|
285
|
-
"expectedToolCalls": [
|
|
286
|
-
"glob",
|
|
287
|
-
"read_file",
|
|
288
|
-
"grep_files",
|
|
289
|
-
"write_file"
|
|
290
|
-
],
|
|
291
|
-
"forbiddenToolCalls": [],
|
|
292
|
-
"maxTurns": 12,
|
|
293
|
-
"expectedFileChanges": [
|
|
294
|
-
"migration-plan.md"
|
|
295
|
-
],
|
|
296
|
-
"testCommand": null,
|
|
297
|
-
"projectFixture": "express-basic"
|
|
298
|
-
}
|
|
299
|
-
},
|
|
300
|
-
{
|
|
301
|
-
"id": "agent-012",
|
|
302
|
-
"prompt": "Design a REST API for a todo-list feature. Create an api-design.md document that includes: endpoint definitions (CRUD), request/response schemas with examples, error response format, pagination strategy, and authentication requirements. Review existing patterns in the codebase first for consistency.",
|
|
303
|
-
"taskType": "coding-agent",
|
|
304
|
-
"difficulty": "medium",
|
|
305
|
-
"expectedTraits": [
|
|
306
|
-
"reads before writing",
|
|
307
|
-
"plans before executing",
|
|
308
|
-
"uses glob for discovery"
|
|
309
|
-
],
|
|
310
|
-
"agentExpectations": {
|
|
311
|
-
"expectedToolCalls": [
|
|
312
|
-
"glob",
|
|
313
|
-
"read_file",
|
|
314
|
-
"write_file"
|
|
315
|
-
],
|
|
316
|
-
"forbiddenToolCalls": [],
|
|
317
|
-
"maxTurns": 10,
|
|
318
|
-
"expectedFileChanges": [
|
|
319
|
-
"api-design.md"
|
|
320
|
-
],
|
|
321
|
-
"testCommand": null,
|
|
322
|
-
"projectFixture": "express-basic"
|
|
323
|
-
}
|
|
324
|
-
},
|
|
325
|
-
{
|
|
326
|
-
"id": "agent-013",
|
|
327
|
-
"prompt": "Implement JWT authentication middleware for the Express app. Create an auth.js middleware that verifies Bearer tokens from the Authorization header, extracts user info, and attaches it to req.user. Add a POST /login endpoint that accepts { username, password } and returns a JWT. Protect the existing GET / endpoint. Add tests for the auth flow.",
|
|
328
|
-
"taskType": "coding-agent",
|
|
329
|
-
"difficulty": "hard",
|
|
330
|
-
"expectedTraits": [
|
|
331
|
-
"has code block",
|
|
332
|
-
"reads before writing",
|
|
333
|
-
"runs tests after changes",
|
|
334
|
-
"multi-file coordination",
|
|
335
|
-
"efficient tool use"
|
|
336
|
-
],
|
|
337
|
-
"agentExpectations": {
|
|
338
|
-
"expectedToolCalls": [
|
|
339
|
-
"read_file",
|
|
340
|
-
"glob",
|
|
341
|
-
"write_file",
|
|
342
|
-
"edit_file",
|
|
343
|
-
"run_shell"
|
|
344
|
-
],
|
|
345
|
-
"forbiddenToolCalls": [],
|
|
346
|
-
"maxTurns": 18,
|
|
347
|
-
"expectedFileChanges": [
|
|
348
|
-
"auth.js",
|
|
349
|
-
"server.js",
|
|
350
|
-
"test.js"
|
|
351
|
-
],
|
|
352
|
-
"testCommand": "npm test",
|
|
353
|
-
"projectFixture": "express-basic"
|
|
354
|
-
}
|
|
355
|
-
},
|
|
356
|
-
{
|
|
357
|
-
"id": "agent-014",
|
|
358
|
-
"prompt": "Add a complete CRUD endpoint for 'projects' to the Express app: GET /projects (list all), GET /projects/:id (get one), POST /projects (create), PUT /projects/:id (update), DELETE /projects/:id (delete). Use an in-memory array as storage. Include input validation and proper HTTP status codes. Add tests for all endpoints.",
|
|
359
|
-
"taskType": "coding-agent",
|
|
360
|
-
"difficulty": "hard",
|
|
361
|
-
"expectedTraits": [
|
|
362
|
-
"has code block",
|
|
363
|
-
"reads before writing",
|
|
364
|
-
"runs tests after changes",
|
|
365
|
-
"multi-file coordination",
|
|
366
|
-
"handles errors gracefully"
|
|
367
|
-
],
|
|
368
|
-
"agentExpectations": {
|
|
369
|
-
"expectedToolCalls": [
|
|
370
|
-
"read_file",
|
|
371
|
-
"glob",
|
|
372
|
-
"edit_file",
|
|
373
|
-
"run_shell"
|
|
374
|
-
],
|
|
375
|
-
"forbiddenToolCalls": [],
|
|
376
|
-
"maxTurns": 18,
|
|
377
|
-
"expectedFileChanges": [
|
|
378
|
-
"server.js",
|
|
379
|
-
"test.js"
|
|
380
|
-
],
|
|
381
|
-
"testCommand": "npm test",
|
|
382
|
-
"projectFixture": "express-basic"
|
|
383
|
-
}
|
|
384
|
-
},
|
|
385
|
-
{
|
|
386
|
-
"id": "agent-015",
|
|
387
|
-
"prompt": "Add a 'greet' command to the CLI tool that accepts a --name flag and an optional --format flag (plain or json). 'mycli greet --name Alice' should output 'Hello, Alice!' and 'mycli greet --name Alice --format json' should output {\"greeting\": \"Hello, Alice!\"}. Add tests for both formats and missing name error handling.",
|
|
388
|
-
"taskType": "coding-agent",
|
|
389
|
-
"difficulty": "medium",
|
|
390
|
-
"expectedTraits": [
|
|
391
|
-
"has code block",
|
|
392
|
-
"reads before writing",
|
|
393
|
-
"runs tests after changes",
|
|
394
|
-
"handles errors gracefully"
|
|
395
|
-
],
|
|
396
|
-
"agentExpectations": {
|
|
397
|
-
"expectedToolCalls": [
|
|
398
|
-
"read_file",
|
|
399
|
-
"glob",
|
|
400
|
-
"edit_file",
|
|
401
|
-
"run_shell"
|
|
402
|
-
],
|
|
403
|
-
"forbiddenToolCalls": [],
|
|
404
|
-
"maxTurns": 12,
|
|
405
|
-
"expectedFileChanges": [
|
|
406
|
-
"index.js",
|
|
407
|
-
"test.js"
|
|
408
|
-
],
|
|
409
|
-
"testCommand": "npm test",
|
|
410
|
-
"projectFixture": "node-cli"
|
|
411
|
-
}
|
|
412
|
-
},
|
|
413
|
-
{
|
|
414
|
-
"id": "agent-016",
|
|
415
|
-
"prompt": "Add error handling middleware to the Express app. It should catch any unhandled errors from route handlers, log the error message to stderr, and return a 500 JSON response with { error: 'Internal Server Error' }. Make sure existing routes still work and add a test that triggers a 500 error.",
|
|
416
|
-
"taskType": "coding-agent",
|
|
417
|
-
"difficulty": "medium",
|
|
418
|
-
"expectedTraits": [
|
|
419
|
-
"has code block",
|
|
420
|
-
"reads before writing",
|
|
421
|
-
"runs tests after changes",
|
|
422
|
-
"handles errors gracefully"
|
|
423
|
-
],
|
|
424
|
-
"agentExpectations": {
|
|
425
|
-
"expectedToolCalls": [
|
|
426
|
-
"read_file",
|
|
427
|
-
"edit_file",
|
|
428
|
-
"run_shell"
|
|
429
|
-
],
|
|
430
|
-
"forbiddenToolCalls": [],
|
|
431
|
-
"maxTurns": 12,
|
|
432
|
-
"expectedFileChanges": [
|
|
433
|
-
"server.js",
|
|
434
|
-
"test.js"
|
|
435
|
-
],
|
|
436
|
-
"testCommand": "npm test",
|
|
437
|
-
"projectFixture": "express-basic"
|
|
438
|
-
}
|
|
439
|
-
},
|
|
440
|
-
{
|
|
441
|
-
"id": "agent-017",
|
|
442
|
-
"prompt": "Add rate limiting to the Express app. Create a simple in-memory rate limiter that allows max 100 requests per minute per IP. When the limit is exceeded, return 429 with { error: 'Too Many Requests', retryAfter: <seconds> }. Add it as middleware before all routes. Add tests that verify the rate limit works.",
|
|
443
|
-
"taskType": "coding-agent",
|
|
444
|
-
"difficulty": "hard",
|
|
445
|
-
"expectedTraits": [
|
|
446
|
-
"has code block",
|
|
447
|
-
"reads before writing",
|
|
448
|
-
"runs tests after changes",
|
|
449
|
-
"multi-file coordination"
|
|
450
|
-
],
|
|
451
|
-
"agentExpectations": {
|
|
452
|
-
"expectedToolCalls": [
|
|
453
|
-
"read_file",
|
|
454
|
-
"edit_file",
|
|
455
|
-
"write_file",
|
|
456
|
-
"run_shell"
|
|
457
|
-
],
|
|
458
|
-
"forbiddenToolCalls": [],
|
|
459
|
-
"maxTurns": 15,
|
|
460
|
-
"expectedFileChanges": [
|
|
461
|
-
"server.js",
|
|
462
|
-
"test.js"
|
|
463
|
-
],
|
|
464
|
-
"testCommand": "npm test",
|
|
465
|
-
"projectFixture": "express-basic"
|
|
466
|
-
}
|
|
467
|
-
},
|
|
468
|
-
{
|
|
469
|
-
"id": "agent-018",
|
|
470
|
-
"prompt": "The server currently uses url.parse() which is deprecated. Refactor all occurrences to use the WHATWG URL API (new URL()). Make sure the refactoring doesn't break any existing tests.",
|
|
471
|
-
"taskType": "coding-agent",
|
|
472
|
-
"difficulty": "medium",
|
|
473
|
-
"expectedTraits": [
|
|
474
|
-
"reads before writing",
|
|
475
|
-
"uses grep for search",
|
|
476
|
-
"runs tests after changes"
|
|
477
|
-
],
|
|
478
|
-
"agentExpectations": {
|
|
479
|
-
"expectedToolCalls": [
|
|
480
|
-
"grep_files",
|
|
481
|
-
"read_file",
|
|
482
|
-
"edit_file",
|
|
483
|
-
"run_shell"
|
|
484
|
-
],
|
|
485
|
-
"forbiddenToolCalls": [],
|
|
486
|
-
"maxTurns": 10,
|
|
487
|
-
"expectedFileChanges": [
|
|
488
|
-
"server.js"
|
|
489
|
-
],
|
|
490
|
-
"testCommand": "npm test",
|
|
491
|
-
"projectFixture": "express-basic"
|
|
492
|
-
}
|
|
493
|
-
},
|
|
494
|
-
{
|
|
495
|
-
"id": "agent-019",
|
|
496
|
-
"prompt": "Add request logging that writes to a log file. Create a logs/ directory and write each request as a JSON line to logs/access.log with: timestamp, method, path, status, duration_ms, and ip. Add a GET /logs endpoint that returns the last 50 log entries as JSON. Add tests.",
|
|
497
|
-
"taskType": "coding-agent",
|
|
498
|
-
"difficulty": "hard",
|
|
499
|
-
"expectedTraits": [
|
|
500
|
-
"has code block",
|
|
501
|
-
"reads before writing",
|
|
502
|
-
"runs tests after changes",
|
|
503
|
-
"multi-file coordination"
|
|
504
|
-
],
|
|
505
|
-
"agentExpectations": {
|
|
506
|
-
"expectedToolCalls": [
|
|
507
|
-
"read_file",
|
|
508
|
-
"edit_file",
|
|
509
|
-
"write_file",
|
|
510
|
-
"run_shell"
|
|
511
|
-
],
|
|
512
|
-
"forbiddenToolCalls": [],
|
|
513
|
-
"maxTurns": 18,
|
|
514
|
-
"expectedFileChanges": [
|
|
515
|
-
"server.js",
|
|
516
|
-
"test.js"
|
|
517
|
-
],
|
|
518
|
-
"testCommand": "npm test",
|
|
519
|
-
"projectFixture": "express-basic"
|
|
520
|
-
}
|
|
521
|
-
},
|
|
522
|
-
{
|
|
523
|
-
"id": "agent-020",
|
|
524
|
-
"prompt": "Write comprehensive tests for all existing endpoints in the Express app. The current test.js only has basic tests. Add tests for: POST /users with valid and invalid data, GET /search with query parameter, GET /items/:id with edge cases (non-numeric id, id=0), and 404 for unknown routes. Each test should verify both status code and response body.",
|
|
525
|
-
"taskType": "coding-agent",
|
|
526
|
-
"difficulty": "medium",
|
|
527
|
-
"expectedTraits": [
|
|
528
|
-
"reads before writing",
|
|
529
|
-
"runs tests after changes",
|
|
530
|
-
"efficient tool use"
|
|
531
|
-
],
|
|
532
|
-
"agentExpectations": {
|
|
533
|
-
"expectedToolCalls": [
|
|
534
|
-
"read_file",
|
|
535
|
-
"edit_file",
|
|
536
|
-
"run_shell"
|
|
537
|
-
],
|
|
538
|
-
"forbiddenToolCalls": [],
|
|
539
|
-
"maxTurns": 12,
|
|
540
|
-
"expectedFileChanges": [
|
|
541
|
-
"test.js"
|
|
542
|
-
],
|
|
543
|
-
"testCommand": "npm test",
|
|
544
|
-
"projectFixture": "express-basic"
|
|
545
|
-
}
|
|
546
|
-
},
|
|
547
|
-
{
|
|
548
|
-
"id": "agent-mt-001",
|
|
549
|
-
"prompt": "Add a /status endpoint returning JSON with { status: 'ok' }",
|
|
550
|
-
"multiTurn": true,
|
|
551
|
-
"turns": [
|
|
552
|
-
{
|
|
553
|
-
"prompt": "Add a /status endpoint to the Express app that returns JSON { status: 'ok' }",
|
|
554
|
-
"type": "initial"
|
|
555
|
-
},
|
|
556
|
-
{
|
|
557
|
-
"prompt": "Actually, include uptime and memory usage in the response too",
|
|
558
|
-
"type": "correction"
|
|
559
|
-
},
|
|
560
|
-
{
|
|
561
|
-
"prompt": "The memory output is in bytes, convert it to megabytes rounded to 2 decimal places",
|
|
562
|
-
"type": "refinement"
|
|
563
|
-
}
|
|
564
|
-
],
|
|
565
|
-
"taskType": "coding-agent",
|
|
566
|
-
"difficulty": "medium",
|
|
567
|
-
"expectedTraits": [
|
|
568
|
-
"reads before writing",
|
|
569
|
-
"uses edit over write",
|
|
570
|
-
"runs tests after changes"
|
|
571
|
-
],
|
|
572
|
-
"agentExpectations": {
|
|
573
|
-
"expectedToolCalls": [
|
|
574
|
-
"read_file",
|
|
575
|
-
"edit_file",
|
|
576
|
-
"run_shell"
|
|
577
|
-
],
|
|
578
|
-
"maxTurns": 15,
|
|
579
|
-
"expectedFileChanges": [
|
|
580
|
-
"server.js"
|
|
581
|
-
],
|
|
582
|
-
"testCommand": "npm test",
|
|
583
|
-
"projectFixture": "express-basic"
|
|
584
|
-
}
|
|
585
|
-
},
|
|
586
|
-
{
|
|
587
|
-
"id": "agent-mt-002",
|
|
588
|
-
"prompt": "Create a utility module with a capitalize function",
|
|
589
|
-
"multiTurn": true,
|
|
590
|
-
"turns": [
|
|
591
|
-
{
|
|
592
|
-
"prompt": "Create a utils.js module that exports a capitalize(str) function",
|
|
593
|
-
"type": "initial"
|
|
594
|
-
},
|
|
595
|
-
{
|
|
596
|
-
"prompt": "Add a truncate(str, maxLen) function that adds '...' if the string exceeds maxLen",
|
|
597
|
-
"type": "addition"
|
|
598
|
-
},
|
|
599
|
-
{
|
|
600
|
-
"prompt": "Add tests for both functions in test.js",
|
|
601
|
-
"type": "addition"
|
|
602
|
-
},
|
|
603
|
-
{
|
|
604
|
-
"prompt": "The truncate function should not add '...' if the string is exactly maxLen characters",
|
|
605
|
-
"type": "correction"
|
|
606
|
-
}
|
|
607
|
-
],
|
|
608
|
-
"taskType": "coding-agent",
|
|
609
|
-
"difficulty": "medium",
|
|
610
|
-
"expectedTraits": [
|
|
611
|
-
"has code block",
|
|
612
|
-
"reads before writing",
|
|
613
|
-
"runs tests after changes"
|
|
614
|
-
],
|
|
615
|
-
"agentExpectations": {
|
|
616
|
-
"expectedToolCalls": [
|
|
617
|
-
"read_file",
|
|
618
|
-
"write_file",
|
|
619
|
-
"edit_file",
|
|
620
|
-
"run_shell"
|
|
621
|
-
],
|
|
622
|
-
"maxTurns": 20,
|
|
623
|
-
"expectedFileChanges": [
|
|
624
|
-
"utils.js",
|
|
625
|
-
"test.js"
|
|
626
|
-
],
|
|
627
|
-
"testCommand": "npm test",
|
|
628
|
-
"projectFixture": "express-basic"
|
|
629
|
-
}
|
|
630
|
-
},
|
|
631
|
-
{
|
|
632
|
-
"id": "agent-mt-003",
|
|
633
|
-
"prompt": "Add pagination to the GET /users endpoint",
|
|
634
|
-
"multiTurn": true,
|
|
635
|
-
"turns": [
|
|
636
|
-
{
|
|
637
|
-
"prompt": "Add pagination to the GET /users endpoint using query params ?page=1&limit=10",
|
|
638
|
-
"type": "initial"
|
|
639
|
-
},
|
|
640
|
-
{
|
|
641
|
-
"prompt": "Return total count and page metadata in the response: { data: [...], meta: { page, limit, total, totalPages } }",
|
|
642
|
-
"type": "refinement"
|
|
643
|
-
},
|
|
644
|
-
{
|
|
645
|
-
"prompt": "Add a test that verifies pagination with 25 users returns correct page counts",
|
|
646
|
-
"type": "addition"
|
|
647
|
-
}
|
|
648
|
-
],
|
|
649
|
-
"taskType": "coding-agent",
|
|
650
|
-
"difficulty": "hard",
|
|
651
|
-
"expectedTraits": [
|
|
652
|
-
"reads before writing",
|
|
653
|
-
"uses edit over write",
|
|
654
|
-
"runs tests after changes"
|
|
655
|
-
],
|
|
656
|
-
"agentExpectations": {
|
|
657
|
-
"expectedToolCalls": [
|
|
658
|
-
"read_file",
|
|
659
|
-
"edit_file",
|
|
660
|
-
"run_shell"
|
|
661
|
-
],
|
|
662
|
-
"maxTurns": 18,
|
|
663
|
-
"expectedFileChanges": [
|
|
664
|
-
"server.js",
|
|
665
|
-
"test.js"
|
|
666
|
-
],
|
|
667
|
-
"testCommand": "npm test",
|
|
668
|
-
"projectFixture": "express-basic"
|
|
669
|
-
}
|
|
670
|
-
},
|
|
671
|
-
{
|
|
672
|
-
"id": "agent-mt-004",
|
|
673
|
-
"prompt": "Fix the sorting bug and add sort direction",
|
|
674
|
-
"multiTurn": true,
|
|
675
|
-
"turns": [
|
|
676
|
-
{
|
|
677
|
-
"prompt": "The /search endpoint returns results in wrong order. Find and fix the sorting bug.",
|
|
678
|
-
"type": "initial"
|
|
679
|
-
},
|
|
680
|
-
{
|
|
681
|
-
"prompt": "Now add a ?sort=asc|desc query parameter to control sort direction, defaulting to desc",
|
|
682
|
-
"type": "addition"
|
|
683
|
-
},
|
|
684
|
-
{
|
|
685
|
-
"prompt": "Add tests for both ascending and descending sort",
|
|
686
|
-
"type": "addition"
|
|
687
|
-
}
|
|
688
|
-
],
|
|
689
|
-
"taskType": "coding-agent",
|
|
690
|
-
"difficulty": "medium",
|
|
691
|
-
"expectedTraits": [
|
|
692
|
-
"reads before writing",
|
|
693
|
-
"runs tests after changes",
|
|
694
|
-
"uses grep for search"
|
|
695
|
-
],
|
|
696
|
-
"agentExpectations": {
|
|
697
|
-
"expectedToolCalls": [
|
|
698
|
-
"read_file",
|
|
699
|
-
"grep_files",
|
|
700
|
-
"edit_file",
|
|
701
|
-
"run_shell"
|
|
702
|
-
],
|
|
703
|
-
"maxTurns": 15,
|
|
704
|
-
"expectedFileChanges": [
|
|
705
|
-
"server.js",
|
|
706
|
-
"test.js"
|
|
707
|
-
],
|
|
708
|
-
"testCommand": "npm test",
|
|
709
|
-
"projectFixture": "express-buggy-search"
|
|
710
|
-
}
|
|
711
|
-
},
|
|
712
|
-
{
|
|
713
|
-
"id": "agent-mt-005",
|
|
714
|
-
"prompt": "Add request validation middleware",
|
|
715
|
-
"multiTurn": true,
|
|
716
|
-
"turns": [
|
|
717
|
-
{
|
|
718
|
-
"prompt": "Create a middleware/validate.js that validates request body against a schema",
|
|
719
|
-
"type": "initial"
|
|
720
|
-
},
|
|
721
|
-
{
|
|
722
|
-
"prompt": "The schema should support 'required', 'type', and 'maxLength' rules",
|
|
723
|
-
"type": "refinement"
|
|
724
|
-
},
|
|
725
|
-
{
|
|
726
|
-
"prompt": "Apply it to POST /users with schema: name (required, string, maxLength 100), email (required, string)",
|
|
727
|
-
"type": "addition"
|
|
728
|
-
},
|
|
729
|
-
{
|
|
730
|
-
"prompt": "Return all validation errors at once, not just the first one",
|
|
731
|
-
"type": "correction"
|
|
732
|
-
}
|
|
733
|
-
],
|
|
734
|
-
"taskType": "coding-agent",
|
|
735
|
-
"difficulty": "hard",
|
|
736
|
-
"expectedTraits": [
|
|
737
|
-
"has code block",
|
|
738
|
-
"reads before writing",
|
|
739
|
-
"runs tests after changes",
|
|
740
|
-
"multi-file coordination"
|
|
741
|
-
],
|
|
742
|
-
"agentExpectations": {
|
|
743
|
-
"expectedToolCalls": [
|
|
744
|
-
"read_file",
|
|
745
|
-
"glob",
|
|
746
|
-
"write_file",
|
|
747
|
-
"edit_file",
|
|
748
|
-
"run_shell"
|
|
749
|
-
],
|
|
750
|
-
"maxTurns": 20,
|
|
751
|
-
"expectedFileChanges": [
|
|
752
|
-
"middleware/validate.js",
|
|
753
|
-
"server.js",
|
|
754
|
-
"test.js"
|
|
755
|
-
],
|
|
756
|
-
"testCommand": "npm test",
|
|
757
|
-
"projectFixture": "express-basic"
|
|
758
|
-
}
|
|
759
|
-
},
|
|
760
|
-
{
|
|
761
|
-
"id": "agent-ambig-001",
|
|
762
|
-
"prompt": "Improve the API",
|
|
763
|
-
"taskType": "coding-agent",
|
|
764
|
-
"difficulty": "medium",
|
|
765
|
-
"expectedTraits": [
|
|
766
|
-
"reads before writing",
|
|
767
|
-
"uses glob for discovery"
|
|
768
|
-
],
|
|
769
|
-
"agentExpectations": {
|
|
770
|
-
"shouldAskUser": true,
|
|
771
|
-
"expectedToolCalls": [
|
|
772
|
-
"glob",
|
|
773
|
-
"read_file",
|
|
774
|
-
"ask_user"
|
|
775
|
-
],
|
|
776
|
-
"forbiddenToolCalls": [
|
|
777
|
-
"write_file",
|
|
778
|
-
"edit_file"
|
|
779
|
-
],
|
|
780
|
-
"maxTurns": 8,
|
|
781
|
-
"expectedFileChanges": [],
|
|
782
|
-
"projectFixture": "express-basic"
|
|
783
|
-
}
|
|
784
|
-
},
|
|
785
|
-
{
|
|
786
|
-
"id": "agent-ambig-002",
|
|
787
|
-
"prompt": "Make it faster",
|
|
788
|
-
"taskType": "coding-agent",
|
|
789
|
-
"difficulty": "medium",
|
|
790
|
-
"expectedTraits": [
|
|
791
|
-
"reads before writing",
|
|
792
|
-
"uses glob for discovery"
|
|
793
|
-
],
|
|
794
|
-
"agentExpectations": {
|
|
795
|
-
"shouldAskUser": true,
|
|
796
|
-
"expectedToolCalls": [
|
|
797
|
-
"glob",
|
|
798
|
-
"read_file",
|
|
799
|
-
"ask_user"
|
|
800
|
-
],
|
|
801
|
-
"forbiddenToolCalls": [
|
|
802
|
-
"write_file",
|
|
803
|
-
"edit_file"
|
|
804
|
-
],
|
|
805
|
-
"maxTurns": 8,
|
|
806
|
-
"expectedFileChanges": [],
|
|
807
|
-
"projectFixture": "express-basic"
|
|
808
|
-
}
|
|
809
|
-
},
|
|
810
|
-
{
|
|
811
|
-
"id": "agent-ambig-003",
|
|
812
|
-
"prompt": "Add some tests",
|
|
813
|
-
"taskType": "coding-agent",
|
|
814
|
-
"difficulty": "easy",
|
|
815
|
-
"expectedTraits": [
|
|
816
|
-
"reads before writing"
|
|
817
|
-
],
|
|
818
|
-
"agentExpectations": {
|
|
819
|
-
"shouldAskUser": true,
|
|
820
|
-
"expectedToolCalls": [
|
|
821
|
-
"glob",
|
|
822
|
-
"read_file",
|
|
823
|
-
"ask_user"
|
|
824
|
-
],
|
|
825
|
-
"forbiddenToolCalls": [
|
|
826
|
-
"write_file",
|
|
827
|
-
"edit_file"
|
|
828
|
-
],
|
|
829
|
-
"maxTurns": 8,
|
|
830
|
-
"expectedFileChanges": [],
|
|
831
|
-
"projectFixture": "express-basic"
|
|
832
|
-
}
|
|
833
|
-
},
|
|
834
|
-
{
|
|
835
|
-
"id": "agent-ambig-004",
|
|
836
|
-
"prompt": "Refactor the code",
|
|
837
|
-
"taskType": "coding-agent",
|
|
838
|
-
"difficulty": "medium",
|
|
839
|
-
"expectedTraits": [
|
|
840
|
-
"reads before writing",
|
|
841
|
-
"uses glob for discovery"
|
|
842
|
-
],
|
|
843
|
-
"agentExpectations": {
|
|
844
|
-
"shouldAskUser": true,
|
|
845
|
-
"expectedToolCalls": [
|
|
846
|
-
"glob",
|
|
847
|
-
"read_file",
|
|
848
|
-
"ask_user"
|
|
849
|
-
],
|
|
850
|
-
"forbiddenToolCalls": [
|
|
851
|
-
"write_file",
|
|
852
|
-
"edit_file"
|
|
853
|
-
],
|
|
854
|
-
"maxTurns": 8,
|
|
855
|
-
"expectedFileChanges": [],
|
|
856
|
-
"projectFixture": "express-basic"
|
|
857
|
-
}
|
|
858
|
-
},
|
|
859
|
-
{
|
|
860
|
-
"id": "agent-ambig-005",
|
|
861
|
-
"prompt": "Fix the bug",
|
|
862
|
-
"taskType": "coding-agent",
|
|
863
|
-
"difficulty": "easy",
|
|
864
|
-
"expectedTraits": [
|
|
865
|
-
"reads before writing",
|
|
866
|
-
"uses grep for search"
|
|
867
|
-
],
|
|
868
|
-
"agentExpectations": {
|
|
869
|
-
"shouldAskUser": true,
|
|
870
|
-
"expectedToolCalls": [
|
|
871
|
-
"glob",
|
|
872
|
-
"read_file",
|
|
873
|
-
"grep_files",
|
|
874
|
-
"ask_user"
|
|
875
|
-
],
|
|
876
|
-
"forbiddenToolCalls": [
|
|
877
|
-
"write_file",
|
|
878
|
-
"edit_file"
|
|
879
|
-
],
|
|
880
|
-
"maxTurns": 8,
|
|
881
|
-
"expectedFileChanges": [],
|
|
882
|
-
"projectFixture": "express-buggy"
|
|
883
|
-
}
|
|
884
|
-
},
|
|
885
|
-
{
|
|
886
|
-
"id": "agent-mono-001",
|
|
887
|
-
"prompt": "The shared package's validateEmail function is imported in the API server's routes.js but never actually called to validate user email on POST /users. Add email validation to the POST /users handler using the shared validateEmail function. Return 400 with { error: 'Invalid email' } on failure.",
|
|
888
|
-
"taskType": "coding-agent",
|
|
889
|
-
"difficulty": "medium",
|
|
890
|
-
"expectedTraits": [
|
|
891
|
-
"reads before writing",
|
|
892
|
-
"uses grep for search",
|
|
893
|
-
"multi-file coordination"
|
|
894
|
-
],
|
|
895
|
-
"agentExpectations": {
|
|
896
|
-
"expectedToolCalls": [
|
|
897
|
-
"glob",
|
|
898
|
-
"read_file",
|
|
899
|
-
"grep_files",
|
|
900
|
-
"edit_file"
|
|
901
|
-
],
|
|
902
|
-
"forbiddenToolCalls": [],
|
|
903
|
-
"maxTurns": 12,
|
|
904
|
-
"expectedFileChanges": [
|
|
905
|
-
"packages/api/routes.js"
|
|
906
|
-
],
|
|
907
|
-
"testCommand": "npm test",
|
|
908
|
-
"projectFixture": "monorepo-basic"
|
|
909
|
-
}
|
|
910
|
-
},
|
|
911
|
-
{
|
|
912
|
-
"id": "agent-mono-002",
|
|
913
|
-
"prompt": "Add a new shared utility function 'slugify(str)' that converts a string to a URL-friendly slug (lowercase, replace spaces with hyphens, remove non-alphanumeric chars except hyphens). Export it from packages/shared/index.js, then use it in the CLI package's commands.js to add a --slug flag to the 'format' command.",
|
|
914
|
-
"taskType": "coding-agent",
|
|
915
|
-
"difficulty": "hard",
|
|
916
|
-
"expectedTraits": [
|
|
917
|
-
"reads before writing",
|
|
918
|
-
"multi-file coordination",
|
|
919
|
-
"runs tests after changes"
|
|
920
|
-
],
|
|
921
|
-
"agentExpectations": {
|
|
922
|
-
"expectedToolCalls": [
|
|
923
|
-
"glob",
|
|
924
|
-
"read_file",
|
|
925
|
-
"edit_file",
|
|
926
|
-
"run_shell"
|
|
927
|
-
],
|
|
928
|
-
"forbiddenToolCalls": [],
|
|
929
|
-
"maxTurns": 15,
|
|
930
|
-
"expectedFileChanges": [
|
|
931
|
-
"packages/shared/formatters.js",
|
|
932
|
-
"packages/shared/index.js",
|
|
933
|
-
"packages/cli/commands.js"
|
|
934
|
-
],
|
|
935
|
-
"testCommand": "npm test",
|
|
936
|
-
"projectFixture": "monorepo-basic"
|
|
937
|
-
}
|
|
938
|
-
},
|
|
939
|
-
{
|
|
940
|
-
"id": "agent-fs-001",
|
|
941
|
-
"prompt": "The fullstack app's GET /api/items endpoint returns all items without pagination. Add pagination support: accept ?page=1&limit=20 query params, return { data: [...], meta: { page, limit, total, totalPages } }. Update the test to verify pagination works.",
|
|
942
|
-
"taskType": "coding-agent",
|
|
943
|
-
"difficulty": "medium",
|
|
944
|
-
"expectedTraits": [
|
|
945
|
-
"reads before writing",
|
|
946
|
-
"uses grep for search",
|
|
947
|
-
"runs tests after changes"
|
|
948
|
-
],
|
|
949
|
-
"agentExpectations": {
|
|
950
|
-
"expectedToolCalls": [
|
|
951
|
-
"glob",
|
|
952
|
-
"read_file",
|
|
953
|
-
"edit_file",
|
|
954
|
-
"run_shell"
|
|
955
|
-
],
|
|
956
|
-
"forbiddenToolCalls": [],
|
|
957
|
-
"maxTurns": 12,
|
|
958
|
-
"expectedFileChanges": [
|
|
959
|
-
"server/routes/items.js",
|
|
960
|
-
"test.js"
|
|
961
|
-
],
|
|
962
|
-
"testCommand": "npm test",
|
|
963
|
-
"projectFixture": "fullstack-app"
|
|
964
|
-
}
|
|
965
|
-
},
|
|
966
|
-
{
|
|
967
|
-
"id": "agent-fs-002",
|
|
968
|
-
"prompt": "Add rate limiting middleware to the fullstack app. Create server/middleware/rate-limit.js that limits each IP to 100 requests per minute. Apply it to all /api routes. Return 429 with { error: 'Too many requests' } when exceeded. Add a test for the rate limit behavior.",
|
|
969
|
-
"taskType": "coding-agent",
|
|
970
|
-
"difficulty": "hard",
|
|
971
|
-
"expectedTraits": [
|
|
972
|
-
"reads before writing",
|
|
973
|
-
"multi-file coordination",
|
|
974
|
-
"runs tests after changes"
|
|
975
|
-
],
|
|
976
|
-
"agentExpectations": {
|
|
977
|
-
"expectedToolCalls": [
|
|
978
|
-
"glob",
|
|
979
|
-
"read_file",
|
|
980
|
-
"write_file",
|
|
981
|
-
"edit_file",
|
|
982
|
-
"run_shell"
|
|
983
|
-
],
|
|
984
|
-
"forbiddenToolCalls": [],
|
|
985
|
-
"maxTurns": 15,
|
|
986
|
-
"expectedFileChanges": [
|
|
987
|
-
"server/middleware/rate-limit.js",
|
|
988
|
-
"server/router.js",
|
|
989
|
-
"test.js"
|
|
990
|
-
],
|
|
991
|
-
"testCommand": "npm test",
|
|
992
|
-
"projectFixture": "fullstack-app"
|
|
993
|
-
}
|
|
994
|
-
},
|
|
995
|
-
{
|
|
996
|
-
"id": "agent-wes-001",
|
|
997
|
-
"prompt": "The wall-e-subset scorer has a bug: the 'quality' dimension always returns 0 or 1 (see scoreFromTraits line 69: 'ratio >= 0.8 ? 1.0 : ratio'). This is actually correct for ratio but the quality weight (0.2) means it under-contributes. The real bug is in brain.js: the stats() method computes avgComposite but doesn't round it, leading to floating point noise. Fix stats() to round avgComposite to 3 decimal places, and add a test case verifying the fix.",
|
|
998
|
-
"taskType": "coding-agent",
|
|
999
|
-
"difficulty": "medium",
|
|
1000
|
-
"expectedTraits": [
|
|
1001
|
-
"reads before writing",
|
|
1002
|
-
"uses grep for search",
|
|
1003
|
-
"runs tests after changes",
|
|
1004
|
-
"explains the bug"
|
|
1005
|
-
],
|
|
1006
|
-
"agentExpectations": {
|
|
1007
|
-
"expectedToolCalls": [
|
|
1008
|
-
"read_file",
|
|
1009
|
-
"grep_files",
|
|
1010
|
-
"edit_file",
|
|
1011
|
-
"run_shell"
|
|
1012
|
-
],
|
|
1013
|
-
"forbiddenToolCalls": [],
|
|
1014
|
-
"maxTurns": 10,
|
|
1015
|
-
"expectedFileChanges": [
|
|
1016
|
-
"brain.js",
|
|
1017
|
-
"test.js"
|
|
1018
|
-
],
|
|
1019
|
-
"testCommand": "npm test",
|
|
1020
|
-
"projectFixture": "wall-e-subset"
|
|
1021
|
-
}
|
|
1022
|
-
},
|
|
1023
|
-
{
|
|
1024
|
-
"id": "agent-py-001",
|
|
1025
|
-
"prompt": "Add a /health endpoint to the Flask app that returns JSON with status 'ok' and uptime in seconds (time since the app started).",
|
|
1026
|
-
"taskType": "coding-agent",
|
|
1027
|
-
"difficulty": "easy",
|
|
1028
|
-
"expectedTraits": [
|
|
1029
|
-
"has code block",
|
|
1030
|
-
"reads before writing",
|
|
1031
|
-
"uses edit over write"
|
|
1032
|
-
],
|
|
1033
|
-
"agentExpectations": {
|
|
1034
|
-
"expectedToolCalls": [
|
|
1035
|
-
"read_file",
|
|
1036
|
-
"glob",
|
|
1037
|
-
"edit_file"
|
|
1038
|
-
],
|
|
1039
|
-
"forbiddenToolCalls": [],
|
|
1040
|
-
"maxTurns": 8,
|
|
1041
|
-
"expectedFileChanges": [
|
|
1042
|
-
"app.py"
|
|
1043
|
-
],
|
|
1044
|
-
"testCommand": "python -m pytest",
|
|
1045
|
-
"projectFixture": "python-flask"
|
|
1046
|
-
}
|
|
1047
|
-
},
|
|
1048
|
-
{
|
|
1049
|
-
"id": "agent-py-002",
|
|
1050
|
-
"prompt": "Add input validation to the Flask app: create a POST /items endpoint that accepts JSON with 'name' (required string, 1-100 chars) and 'price' (required number, > 0). Return 400 with descriptive errors on invalid input, 201 with the created item on success.",
|
|
1051
|
-
"taskType": "coding-agent",
|
|
1052
|
-
"difficulty": "medium",
|
|
1053
|
-
"expectedTraits": [
|
|
1054
|
-
"has code block",
|
|
1055
|
-
"reads before writing",
|
|
1056
|
-
"handles errors gracefully"
|
|
1057
|
-
],
|
|
1058
|
-
"agentExpectations": {
|
|
1059
|
-
"expectedToolCalls": [
|
|
1060
|
-
"read_file",
|
|
1061
|
-
"glob",
|
|
1062
|
-
"edit_file"
|
|
1063
|
-
],
|
|
1064
|
-
"forbiddenToolCalls": [],
|
|
1065
|
-
"maxTurns": 10,
|
|
1066
|
-
"expectedFileChanges": [
|
|
1067
|
-
"app.py"
|
|
1068
|
-
],
|
|
1069
|
-
"testCommand": "python -m pytest",
|
|
1070
|
-
"projectFixture": "python-flask"
|
|
1071
|
-
}
|
|
1072
|
-
},
|
|
1073
|
-
{
|
|
1074
|
-
"id": "agent-py-003",
|
|
1075
|
-
"prompt": "There's a bug in the Flask app: the GET / endpoint returns a datetime field that uses naive datetime (no timezone info), but the API contract requires UTC ISO 8601 format. Fix the serialization to always return timezone-aware UTC timestamps.",
|
|
1076
|
-
"taskType": "coding-agent",
|
|
1077
|
-
"difficulty": "medium",
|
|
1078
|
-
"expectedTraits": [
|
|
1079
|
-
"has code block",
|
|
1080
|
-
"reads before writing",
|
|
1081
|
-
"explains the bug"
|
|
1082
|
-
],
|
|
1083
|
-
"agentExpectations": {
|
|
1084
|
-
"expectedToolCalls": [
|
|
1085
|
-
"read_file",
|
|
1086
|
-
"grep_files",
|
|
1087
|
-
"edit_file"
|
|
1088
|
-
],
|
|
1089
|
-
"forbiddenToolCalls": [],
|
|
1090
|
-
"maxTurns": 10,
|
|
1091
|
-
"expectedFileChanges": [
|
|
1092
|
-
"app.py"
|
|
1093
|
-
],
|
|
1094
|
-
"testCommand": "python -m pytest",
|
|
1095
|
-
"projectFixture": "python-flask"
|
|
1096
|
-
}
|
|
1097
|
-
},
|
|
1098
|
-
{
|
|
1099
|
-
"id": "agent-py-004",
|
|
1100
|
-
"prompt": "Add a SQLite persistence layer to the Flask app. Create a db.py module that initializes a SQLite database with an 'items' table (id INTEGER PRIMARY KEY, name TEXT, price REAL, created_at TEXT). Add functions to insert and retrieve items. Wire it into the Flask app so POST /items stores to the database and GET /items reads from it.",
|
|
1101
|
-
"taskType": "coding-agent",
|
|
1102
|
-
"difficulty": "hard",
|
|
1103
|
-
"expectedTraits": [
|
|
1104
|
-
"has code block",
|
|
1105
|
-
"reads before writing",
|
|
1106
|
-
"multi-file coordination",
|
|
1107
|
-
"runs tests after changes"
|
|
1108
|
-
],
|
|
1109
|
-
"agentExpectations": {
|
|
1110
|
-
"expectedToolCalls": [
|
|
1111
|
-
"read_file",
|
|
1112
|
-
"glob",
|
|
1113
|
-
"write_file",
|
|
1114
|
-
"edit_file",
|
|
1115
|
-
"run_shell"
|
|
1116
|
-
],
|
|
1117
|
-
"forbiddenToolCalls": [],
|
|
1118
|
-
"maxTurns": 18,
|
|
1119
|
-
"expectedFileChanges": [
|
|
1120
|
-
"db.py",
|
|
1121
|
-
"app.py"
|
|
1122
|
-
],
|
|
1123
|
-
"testCommand": "python -m pytest",
|
|
1124
|
-
"projectFixture": "python-flask"
|
|
1125
|
-
}
|
|
1126
|
-
},
|
|
1127
|
-
{
|
|
1128
|
-
"id": "agent-py-005",
|
|
1129
|
-
"prompt": "Refactor the Flask app: split the monolithic app.py into blueprints. Create a blueprints/ directory with items.py and health.py. Move route handlers into their respective blueprints. Create a shared config.py for app configuration. Ensure all existing tests still pass.",
|
|
1130
|
-
"taskType": "coding-agent",
|
|
1131
|
-
"difficulty": "hard",
|
|
1132
|
-
"expectedTraits": [
|
|
1133
|
-
"has code block",
|
|
1134
|
-
"reads before writing",
|
|
1135
|
-
"multi-file coordination",
|
|
1136
|
-
"runs tests after changes"
|
|
1137
|
-
],
|
|
1138
|
-
"agentExpectations": {
|
|
1139
|
-
"expectedToolCalls": [
|
|
1140
|
-
"read_file",
|
|
1141
|
-
"glob",
|
|
1142
|
-
"write_file",
|
|
1143
|
-
"edit_file",
|
|
1144
|
-
"run_shell"
|
|
1145
|
-
],
|
|
1146
|
-
"forbiddenToolCalls": [],
|
|
1147
|
-
"maxTurns": 20,
|
|
1148
|
-
"expectedFileChanges": [
|
|
1149
|
-
"app.py",
|
|
1150
|
-
"config.py",
|
|
1151
|
-
"blueprints/items.py",
|
|
1152
|
-
"blueprints/health.py"
|
|
1153
|
-
],
|
|
1154
|
-
"testCommand": "python -m pytest",
|
|
1155
|
-
"projectFixture": "python-flask"
|
|
1156
|
-
}
|
|
1157
|
-
},
|
|
1158
|
-
{
|
|
1159
|
-
"id": "agent-lsp-001",
|
|
1160
|
-
"prompt": "Rename the function 'processData' to 'transformPayload' across the entire project. Update all callers, imports, exports, and tests. Use LSP references to find all usages before making changes.",
|
|
1161
|
-
"taskType": "coding-agent",
|
|
1162
|
-
"difficulty": "medium",
|
|
1163
|
-
"expectedTraits": [
|
|
1164
|
-
"reads before writing",
|
|
1165
|
-
"uses grep for search",
|
|
1166
|
-
"multi-file coordination",
|
|
1167
|
-
"efficient tool use"
|
|
1168
|
-
],
|
|
1169
|
-
"agentExpectations": {
|
|
1170
|
-
"expectedToolCalls": [
|
|
1171
|
-
"read_file",
|
|
1172
|
-
"grep_files",
|
|
1173
|
-
"edit_file",
|
|
1174
|
-
"lsp_references"
|
|
1175
|
-
],
|
|
1176
|
-
"forbiddenToolCalls": [],
|
|
1177
|
-
"maxTurns": 12,
|
|
1178
|
-
"expectedFileChanges": [
|
|
1179
|
-
"server.js",
|
|
1180
|
-
"test.js"
|
|
1181
|
-
],
|
|
1182
|
-
"testCommand": "npm test",
|
|
1183
|
-
"projectFixture": "express-basic"
|
|
1184
|
-
}
|
|
1185
|
-
},
|
|
1186
|
-
{
|
|
1187
|
-
"id": "agent-lsp-002",
|
|
1188
|
-
"prompt": "This project uses JSDoc type annotations but has type errors. Run diagnostics to find all type-related issues, then fix them. The function signatures should match their documented types.",
|
|
1189
|
-
"taskType": "coding-agent",
|
|
1190
|
-
"difficulty": "medium",
|
|
1191
|
-
"expectedTraits": [
|
|
1192
|
-
"reads before writing",
|
|
1193
|
-
"uses LSP diagnostics",
|
|
1194
|
-
"runs tests after changes"
|
|
1195
|
-
],
|
|
1196
|
-
"agentExpectations": {
|
|
1197
|
-
"expectedToolCalls": [
|
|
1198
|
-
"read_file",
|
|
1199
|
-
"glob",
|
|
1200
|
-
"edit_file",
|
|
1201
|
-
"lsp_diagnostics"
|
|
1202
|
-
],
|
|
1203
|
-
"forbiddenToolCalls": [],
|
|
1204
|
-
"maxTurns": 15,
|
|
1205
|
-
"expectedFileChanges": [
|
|
1206
|
-
"src/utils.js",
|
|
1207
|
-
"src/handlers.js"
|
|
1208
|
-
],
|
|
1209
|
-
"testCommand": "npm test",
|
|
1210
|
-
"projectFixture": "node-typed"
|
|
1211
|
-
}
|
|
1212
|
-
},
|
|
1213
|
-
{
|
|
1214
|
-
"id": "agent-lsp-003",
|
|
1215
|
-
"prompt": "Extract all inline SQL queries from the route handlers into a separate db.js module. Use LSP to find all references to the database connection, create a clean data access layer, and update all imports. Ensure tests still pass.",
|
|
1216
|
-
"taskType": "coding-agent",
|
|
1217
|
-
"difficulty": "hard",
|
|
1218
|
-
"expectedTraits": [
|
|
1219
|
-
"reads before writing",
|
|
1220
|
-
"multi-file coordination",
|
|
1221
|
-
"runs tests after changes"
|
|
1222
|
-
],
|
|
1223
|
-
"agentExpectations": {
|
|
1224
|
-
"expectedToolCalls": [
|
|
1225
|
-
"read_file",
|
|
1226
|
-
"grep_files",
|
|
1227
|
-
"write_file",
|
|
1228
|
-
"edit_file",
|
|
1229
|
-
"lsp_definition",
|
|
1230
|
-
"lsp_references",
|
|
1231
|
-
"run_shell"
|
|
1232
|
-
],
|
|
1233
|
-
"forbiddenToolCalls": [],
|
|
1234
|
-
"maxTurns": 18,
|
|
1235
|
-
"expectedFileChanges": [
|
|
1236
|
-
"server/db.js",
|
|
1237
|
-
"server/routes/items.js",
|
|
1238
|
-
"server/routes/users.js"
|
|
1239
|
-
],
|
|
1240
|
-
"testCommand": "npm test",
|
|
1241
|
-
"projectFixture": "fullstack-app"
|
|
1242
|
-
}
|
|
1243
|
-
},
|
|
1244
|
-
{
|
|
1245
|
-
"id": "agent-sec-001",
|
|
1246
|
-
"prompt": "The user login endpoint in this app has a SQL injection vulnerability. The query uses string concatenation instead of parameterized queries. Find and fix the SQL injection vulnerability, and add a test that proves the injection is no longer possible.",
|
|
1247
|
-
"taskType": "coding-agent",
|
|
1248
|
-
"difficulty": "medium",
|
|
1249
|
-
"expectedTraits": [
|
|
1250
|
-
"has code block",
|
|
1251
|
-
"reads before writing",
|
|
1252
|
-
"uses grep for search",
|
|
1253
|
-
"runs tests after changes"
|
|
1254
|
-
],
|
|
1255
|
-
"agentExpectations": {
|
|
1256
|
-
"expectedToolCalls": [
|
|
1257
|
-
"read_file",
|
|
1258
|
-
"grep_files",
|
|
1259
|
-
"edit_file",
|
|
1260
|
-
"run_shell"
|
|
1261
|
-
],
|
|
1262
|
-
"forbiddenToolCalls": [],
|
|
1263
|
-
"maxTurns": 12,
|
|
1264
|
-
"expectedFileChanges": [
|
|
1265
|
-
"server.js",
|
|
1266
|
-
"test.js"
|
|
1267
|
-
],
|
|
1268
|
-
"testCommand": "npm test",
|
|
1269
|
-
"projectFixture": "express-buggy"
|
|
1270
|
-
}
|
|
1271
|
-
},
|
|
1272
|
-
{
|
|
1273
|
-
"id": "agent-sec-002",
|
|
1274
|
-
"prompt": "This Express app renders user input directly in HTML templates without sanitization, creating XSS vulnerabilities. Find all places where user input is rendered unsanitized, fix them by adding proper escaping/sanitization, and add tests to verify the fixes.",
|
|
1275
|
-
"taskType": "coding-agent",
|
|
1276
|
-
"difficulty": "medium",
|
|
1277
|
-
"expectedTraits": [
|
|
1278
|
-
"has code block",
|
|
1279
|
-
"reads before writing",
|
|
1280
|
-
"uses grep for search",
|
|
1281
|
-
"runs tests after changes"
|
|
1282
|
-
],
|
|
1283
|
-
"agentExpectations": {
|
|
1284
|
-
"expectedToolCalls": [
|
|
1285
|
-
"read_file",
|
|
1286
|
-
"grep_files",
|
|
1287
|
-
"edit_file",
|
|
1288
|
-
"run_shell"
|
|
1289
|
-
],
|
|
1290
|
-
"forbiddenToolCalls": [],
|
|
1291
|
-
"maxTurns": 15,
|
|
1292
|
-
"expectedFileChanges": [
|
|
1293
|
-
"server.js",
|
|
1294
|
-
"views/profile.ejs",
|
|
1295
|
-
"test.js"
|
|
1296
|
-
],
|
|
1297
|
-
"testCommand": "npm test",
|
|
1298
|
-
"projectFixture": "express-xss"
|
|
1299
|
-
}
|
|
1300
|
-
},
|
|
1301
|
-
{
|
|
1302
|
-
"id": "agent-sec-003",
|
|
1303
|
-
"prompt": "Add rate limiting and CSRF protection to all POST routes in the Express app. Create a rate limiter middleware (100 requests per minute per IP) and a CSRF token middleware. Apply both to all POST endpoints. Add tests verifying rate limiting returns 429 and CSRF validation rejects requests without valid tokens.",
|
|
1304
|
-
"taskType": "coding-agent",
|
|
1305
|
-
"difficulty": "hard",
|
|
1306
|
-
"expectedTraits": [
|
|
1307
|
-
"has code block",
|
|
1308
|
-
"reads before writing",
|
|
1309
|
-
"multi-file coordination",
|
|
1310
|
-
"runs tests after changes"
|
|
1311
|
-
],
|
|
1312
|
-
"agentExpectations": {
|
|
1313
|
-
"expectedToolCalls": [
|
|
1314
|
-
"read_file",
|
|
1315
|
-
"glob",
|
|
1316
|
-
"write_file",
|
|
1317
|
-
"edit_file",
|
|
1318
|
-
"run_shell"
|
|
1319
|
-
],
|
|
1320
|
-
"forbiddenToolCalls": [],
|
|
1321
|
-
"maxTurns": 18,
|
|
1322
|
-
"expectedFileChanges": [
|
|
1323
|
-
"server.js",
|
|
1324
|
-
"middleware/rate-limit.js",
|
|
1325
|
-
"middleware/csrf.js",
|
|
1326
|
-
"test.js"
|
|
1327
|
-
],
|
|
1328
|
-
"testCommand": "npm test",
|
|
1329
|
-
"projectFixture": "express-basic"
|
|
1330
|
-
}
|
|
1331
|
-
},
|
|
1332
|
-
{
|
|
1333
|
-
"id": "agent-sec-004",
|
|
1334
|
-
"prompt": "Audit the project's dependencies for known security vulnerabilities. Run npm audit, identify vulnerable packages, upgrade them to safe versions, and verify no breaking changes were introduced. Document findings in a SECURITY.md file.",
|
|
1335
|
-
"taskType": "coding-agent",
|
|
1336
|
-
"difficulty": "hard",
|
|
1337
|
-
"expectedTraits": [
|
|
1338
|
-
"reads before writing",
|
|
1339
|
-
"runs tests after changes"
|
|
1340
|
-
],
|
|
1341
|
-
"agentExpectations": {
|
|
1342
|
-
"expectedToolCalls": [
|
|
1343
|
-
"run_shell",
|
|
1344
|
-
"read_file",
|
|
1345
|
-
"edit_file",
|
|
1346
|
-
"write_file"
|
|
1347
|
-
],
|
|
1348
|
-
"forbiddenToolCalls": [],
|
|
1349
|
-
"maxTurns": 15,
|
|
1350
|
-
"expectedFileChanges": [
|
|
1351
|
-
"package.json",
|
|
1352
|
-
"SECURITY.md"
|
|
1353
|
-
],
|
|
1354
|
-
"testCommand": "npm test",
|
|
1355
|
-
"projectFixture": "fullstack-app"
|
|
1356
|
-
}
|
|
1357
|
-
},
|
|
1358
|
-
{
|
|
1359
|
-
"id": "agent-mf-001",
|
|
1360
|
-
"prompt": "Add pagination to the GET /api/items endpoint: accept ?page and ?limit query params. Also update the frontend public/js/items.js to display page controls (Previous/Next buttons) and call the paginated API. Return pagination metadata in the API response.",
|
|
1361
|
-
"taskType": "coding-agent",
|
|
1362
|
-
"difficulty": "medium",
|
|
1363
|
-
"expectedTraits": [
|
|
1364
|
-
"reads before writing",
|
|
1365
|
-
"multi-file coordination",
|
|
1366
|
-
"runs tests after changes"
|
|
1367
|
-
],
|
|
1368
|
-
"agentExpectations": {
|
|
1369
|
-
"expectedToolCalls": [
|
|
1370
|
-
"glob",
|
|
1371
|
-
"read_file",
|
|
1372
|
-
"edit_file",
|
|
1373
|
-
"run_shell"
|
|
1374
|
-
],
|
|
1375
|
-
"forbiddenToolCalls": [],
|
|
1376
|
-
"maxTurns": 15,
|
|
1377
|
-
"expectedFileChanges": [
|
|
1378
|
-
"server/routes/items.js",
|
|
1379
|
-
"public/js/items.js",
|
|
1380
|
-
"test.js"
|
|
1381
|
-
],
|
|
1382
|
-
"testCommand": "npm test",
|
|
1383
|
-
"projectFixture": "fullstack-app"
|
|
1384
|
-
}
|
|
1385
|
-
},
|
|
1386
|
-
{
|
|
1387
|
-
"id": "agent-mf-002",
|
|
1388
|
-
"prompt": "Add authentication middleware to the fullstack app. Create a JWT-based auth middleware that protects all /api routes except POST /api/auth/login. Add a login page at public/login.html. Update the frontend to include the JWT token in all API requests. Add tests for protected and unprotected routes.",
|
|
1389
|
-
"taskType": "coding-agent",
|
|
1390
|
-
"difficulty": "hard",
|
|
1391
|
-
"expectedTraits": [
|
|
1392
|
-
"reads before writing",
|
|
1393
|
-
"multi-file coordination",
|
|
1394
|
-
"runs tests after changes"
|
|
1395
|
-
],
|
|
1396
|
-
"agentExpectations": {
|
|
1397
|
-
"expectedToolCalls": [
|
|
1398
|
-
"glob",
|
|
1399
|
-
"read_file",
|
|
1400
|
-
"write_file",
|
|
1401
|
-
"edit_file",
|
|
1402
|
-
"run_shell"
|
|
1403
|
-
],
|
|
1404
|
-
"forbiddenToolCalls": [],
|
|
1405
|
-
"maxTurns": 20,
|
|
1406
|
-
"expectedFileChanges": [
|
|
1407
|
-
"server/middleware/auth.js",
|
|
1408
|
-
"server/routes/auth.js",
|
|
1409
|
-
"server/router.js",
|
|
1410
|
-
"public/login.html",
|
|
1411
|
-
"test.js"
|
|
1412
|
-
],
|
|
1413
|
-
"testCommand": "npm test",
|
|
1414
|
-
"projectFixture": "fullstack-app"
|
|
1415
|
-
}
|
|
1416
|
-
},
|
|
1417
|
-
{
|
|
1418
|
-
"id": "agent-mf-003",
|
|
1419
|
-
"prompt": "Convert the monorepo from CommonJS (require/module.exports) to ES Modules (import/export). Update all package.json files to add \"type\": \"module\", convert all require() calls to import statements, convert all module.exports to export statements. Ensure all tests pass after conversion.",
|
|
1420
|
-
"taskType": "coding-agent",
|
|
1421
|
-
"difficulty": "hard",
|
|
1422
|
-
"expectedTraits": [
|
|
1423
|
-
"reads before writing",
|
|
1424
|
-
"uses grep for search",
|
|
1425
|
-
"multi-file coordination",
|
|
1426
|
-
"runs tests after changes"
|
|
1427
|
-
],
|
|
1428
|
-
"agentExpectations": {
|
|
1429
|
-
"expectedToolCalls": [
|
|
1430
|
-
"glob",
|
|
1431
|
-
"grep_files",
|
|
1432
|
-
"read_file",
|
|
1433
|
-
"edit_file",
|
|
1434
|
-
"run_shell"
|
|
1435
|
-
],
|
|
1436
|
-
"forbiddenToolCalls": [],
|
|
1437
|
-
"maxTurns": 20,
|
|
1438
|
-
"expectedFileChanges": [
|
|
1439
|
-
"package.json",
|
|
1440
|
-
"packages/shared/index.js",
|
|
1441
|
-
"packages/shared/formatters.js",
|
|
1442
|
-
"packages/shared/validators.js",
|
|
1443
|
-
"packages/api/server.js",
|
|
1444
|
-
"packages/api/routes.js",
|
|
1445
|
-
"packages/cli/index.js",
|
|
1446
|
-
"packages/cli/commands.js"
|
|
1447
|
-
],
|
|
1448
|
-
"testCommand": "npm test",
|
|
1449
|
-
"projectFixture": "monorepo-basic"
|
|
1450
|
-
}
|
|
1451
|
-
},
|
|
1452
|
-
{
|
|
1453
|
-
"id": "agent-mf-004",
|
|
1454
|
-
"prompt": "Add a CI/CD pipeline and containerization to the fullstack app. Create a .github/workflows/ci.yml that runs tests on push, a Dockerfile for the app, and a docker-compose.yml that includes the app and a volume for the SQLite database. The CI should lint, test, and build the Docker image.",
|
|
1455
|
-
"taskType": "coding-agent",
|
|
1456
|
-
"difficulty": "hard",
|
|
1457
|
-
"expectedTraits": [
|
|
1458
|
-
"reads before writing",
|
|
1459
|
-
"multi-file coordination"
|
|
1460
|
-
],
|
|
1461
|
-
"agentExpectations": {
|
|
1462
|
-
"expectedToolCalls": [
|
|
1463
|
-
"glob",
|
|
1464
|
-
"read_file",
|
|
1465
|
-
"write_file"
|
|
1466
|
-
],
|
|
1467
|
-
"forbiddenToolCalls": [],
|
|
1468
|
-
"maxTurns": 15,
|
|
1469
|
-
"expectedFileChanges": [
|
|
1470
|
-
".github/workflows/ci.yml",
|
|
1471
|
-
"Dockerfile",
|
|
1472
|
-
"docker-compose.yml"
|
|
1473
|
-
],
|
|
1474
|
-
"testCommand": null,
|
|
1475
|
-
"projectFixture": "fullstack-app"
|
|
1476
|
-
}
|
|
1477
|
-
},
|
|
1478
|
-
{
|
|
1479
|
-
"id": "agent-ambig-006",
|
|
1480
|
-
"prompt": "Make the API faster",
|
|
1481
|
-
"taskType": "coding-agent",
|
|
1482
|
-
"difficulty": "medium",
|
|
1483
|
-
"expectedTraits": [
|
|
1484
|
-
"reads before writing",
|
|
1485
|
-
"uses glob for discovery"
|
|
1486
|
-
],
|
|
1487
|
-
"agentExpectations": {
|
|
1488
|
-
"shouldAskUser": true,
|
|
1489
|
-
"expectedToolCalls": [
|
|
1490
|
-
"glob",
|
|
1491
|
-
"read_file",
|
|
1492
|
-
"ask_user"
|
|
1493
|
-
],
|
|
1494
|
-
"forbiddenToolCalls": [
|
|
1495
|
-
"write_file",
|
|
1496
|
-
"edit_file"
|
|
1497
|
-
],
|
|
1498
|
-
"maxTurns": 8,
|
|
1499
|
-
"expectedFileChanges": [],
|
|
1500
|
-
"projectFixture": "fullstack-app"
|
|
1501
|
-
}
|
|
1502
|
-
},
|
|
1503
|
-
{
|
|
1504
|
-
"id": "agent-ambig-007",
|
|
1505
|
-
"prompt": "Add caching",
|
|
1506
|
-
"taskType": "coding-agent",
|
|
1507
|
-
"difficulty": "medium",
|
|
1508
|
-
"expectedTraits": [
|
|
1509
|
-
"reads before writing",
|
|
1510
|
-
"uses glob for discovery"
|
|
1511
|
-
],
|
|
1512
|
-
"agentExpectations": {
|
|
1513
|
-
"shouldAskUser": true,
|
|
1514
|
-
"expectedToolCalls": [
|
|
1515
|
-
"glob",
|
|
1516
|
-
"read_file",
|
|
1517
|
-
"ask_user"
|
|
1518
|
-
],
|
|
1519
|
-
"forbiddenToolCalls": [
|
|
1520
|
-
"write_file",
|
|
1521
|
-
"edit_file"
|
|
1522
|
-
],
|
|
1523
|
-
"maxTurns": 8,
|
|
1524
|
-
"expectedFileChanges": [],
|
|
1525
|
-
"projectFixture": "express-basic"
|
|
1526
|
-
}
|
|
1527
|
-
},
|
|
1528
|
-
{
|
|
1529
|
-
"id": "agent-ambig-008",
|
|
1530
|
-
"prompt": "Refactor the database layer",
|
|
1531
|
-
"taskType": "coding-agent",
|
|
1532
|
-
"difficulty": "hard",
|
|
1533
|
-
"expectedTraits": [
|
|
1534
|
-
"reads before writing",
|
|
1535
|
-
"uses glob for discovery"
|
|
1536
|
-
],
|
|
1537
|
-
"agentExpectations": {
|
|
1538
|
-
"shouldAskUser": true,
|
|
1539
|
-
"expectedToolCalls": [
|
|
1540
|
-
"glob",
|
|
1541
|
-
"read_file",
|
|
1542
|
-
"grep_files",
|
|
1543
|
-
"ask_user"
|
|
1544
|
-
],
|
|
1545
|
-
"forbiddenToolCalls": [
|
|
1546
|
-
"write_file",
|
|
1547
|
-
"edit_file"
|
|
1548
|
-
],
|
|
1549
|
-
"maxTurns": 8,
|
|
1550
|
-
"expectedFileChanges": [],
|
|
1551
|
-
"projectFixture": "fullstack-app"
|
|
1552
|
-
}
|
|
1553
|
-
},
|
|
1554
|
-
{
|
|
1555
|
-
"id": "agent-ambig-009",
|
|
1556
|
-
"prompt": "The tests are flaky, fix them",
|
|
1557
|
-
"taskType": "coding-agent",
|
|
1558
|
-
"difficulty": "hard",
|
|
1559
|
-
"expectedTraits": [
|
|
1560
|
-
"reads before writing",
|
|
1561
|
-
"runs tests after changes",
|
|
1562
|
-
"uses grep for search"
|
|
1563
|
-
],
|
|
1564
|
-
"agentExpectations": {
|
|
1565
|
-
"shouldAskUser": true,
|
|
1566
|
-
"expectedToolCalls": [
|
|
1567
|
-
"run_shell",
|
|
1568
|
-
"read_file",
|
|
1569
|
-
"grep_files",
|
|
1570
|
-
"ask_user"
|
|
1571
|
-
],
|
|
1572
|
-
"forbiddenToolCalls": [
|
|
1573
|
-
"write_file",
|
|
1574
|
-
"edit_file"
|
|
1575
|
-
],
|
|
1576
|
-
"maxTurns": 10,
|
|
1577
|
-
"expectedFileChanges": [],
|
|
1578
|
-
"projectFixture": "fullstack-app"
|
|
1579
|
-
}
|
|
1580
|
-
}
|
|
1581
|
-
]
|