cawdex 1.35.74 → 1.35.76

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. package/README.md +5 -5
  2. package/bin/anycode.js +2 -2
  3. package/bin/cawdex.js +408 -408
  4. package/bin/ecc-hooks.cjs +11 -11
  5. package/dist/agents-md.d.ts +31 -0
  6. package/dist/agents-md.js +340 -0
  7. package/dist/agents-md.js.map +1 -0
  8. package/dist/agents.js +1424 -1424
  9. package/dist/api.d.ts +1 -0
  10. package/dist/api.js +19 -14
  11. package/dist/api.js.map +1 -1
  12. package/dist/autonomous-loops.js +287 -287
  13. package/dist/benchmark-repos.d.ts +31 -0
  14. package/dist/benchmark-repos.js +234 -8
  15. package/dist/benchmark-repos.js.map +1 -1
  16. package/dist/command-palette.js +4 -2
  17. package/dist/command-palette.js.map +1 -1
  18. package/dist/compaction.js +8 -8
  19. package/dist/config.js +51 -36
  20. package/dist/config.js.map +1 -1
  21. package/dist/content-engine.js +543 -543
  22. package/dist/context-brief.d.ts +4 -0
  23. package/dist/context-brief.js +230 -0
  24. package/dist/context-brief.js.map +1 -0
  25. package/dist/cost-tracker.d.ts +33 -14
  26. package/dist/cost-tracker.js +81 -19
  27. package/dist/cost-tracker.js.map +1 -1
  28. package/dist/coverage.js +39 -39
  29. package/dist/docs-sync.js +98 -98
  30. package/dist/evaluation.js +452 -452
  31. package/dist/fixed-footer.d.ts +7 -1
  32. package/dist/fixed-footer.js +92 -18
  33. package/dist/fixed-footer.js.map +1 -1
  34. package/dist/git-workflow.js +49 -49
  35. package/dist/index.d.ts +2 -0
  36. package/dist/index.js +197 -65
  37. package/dist/index.js.map +1 -1
  38. package/dist/instant-artifact.d.ts +6 -0
  39. package/dist/instant-artifact.js +397 -0
  40. package/dist/instant-artifact.js.map +1 -0
  41. package/dist/live-queue.js +1 -1
  42. package/dist/live-queue.js.map +1 -1
  43. package/dist/model-aliases.d.ts +37 -0
  44. package/dist/model-aliases.js +203 -0
  45. package/dist/model-aliases.js.map +1 -0
  46. package/dist/orchestration.js +15 -15
  47. package/dist/permissions.d.ts +6 -0
  48. package/dist/permissions.js +53 -0
  49. package/dist/permissions.js.map +1 -1
  50. package/dist/pm2-manager.js +26 -26
  51. package/dist/query.d.ts +0 -1
  52. package/dist/query.js +74 -39
  53. package/dist/query.js.map +1 -1
  54. package/dist/refactor.js +87 -87
  55. package/dist/repo-command.js +7 -1
  56. package/dist/repo-command.js.map +1 -1
  57. package/dist/search-first.js +92 -92
  58. package/dist/skill-create.js +100 -100
  59. package/dist/stitch.js +1 -1
  60. package/dist/system-prompt.d.ts +2 -1
  61. package/dist/system-prompt.js +10 -5
  62. package/dist/system-prompt.js.map +1 -1
  63. package/dist/tools/github-repo-digest.d.ts +1 -1
  64. package/dist/tools/github-repo-digest.js +38 -6
  65. package/dist/tools/github-repo-digest.js.map +1 -1
  66. package/dist/types.d.ts +3 -0
  67. package/dist/types.js.map +1 -1
  68. package/dist/verification.js +55 -55
  69. package/package.json +1 -1
  70. package/resources/__init__.py +1 -1
  71. package/resources/exgentic/cawdex_agent/README.md +114 -114
  72. package/resources/exgentic/cawdex_agent/__init__.py +5 -5
  73. package/resources/exgentic/cawdex_agent/agent.py +605 -605
  74. package/resources/exgentic/cawdex_agent/requirements.txt +2 -2
  75. package/resources/exgentic/cawdex_agent/setup.sh +21 -21
  76. package/resources/exgentic/cawdex_agent/utils.py +1061 -1061
  77. package/resources/hal/cawdex_agent/README.md +24 -24
  78. package/resources/hal/cawdex_agent/__init__.py +1 -1
  79. package/resources/hal/cawdex_agent/main.py +550 -550
  80. package/resources/hal/cawdex_agent/requirements.txt +2 -2
  81. package/resources/kbench/cawdex_agent/README.md +107 -107
  82. package/resources/kbench/cawdex_agent/adapter.manifest.json +19 -19
  83. package/resources/kbench/cawdex_agent/runner.mjs +753 -753
  84. package/resources/open_agent_leaderboard/cawdex-agent-card.md +119 -119
  85. package/resources/terminal_bench/__init__.py +1 -1
  86. package/resources/terminal_bench/cawdex_agent.py +174 -174
  87. package/resources/terminal_bench/setup.sh +121 -121
@@ -1,550 +1,550 @@
1
- """HAL custom-agent adapter for Cawdex.
2
-
3
- HAL expects a module-level run(input, **kwargs) function. This adapter keeps
4
- Cawdex framework-agnostic by launching the installed CLI in headless
5
- benchmark mode, then returning the artifact shape expected by common HAL tasks.
6
- """
7
-
8
- from __future__ import annotations
9
-
10
- import json
11
- import os
12
- import re
13
- import shlex
14
- import subprocess
15
- from dataclasses import dataclass
16
- from pathlib import Path
17
- from typing import Any
18
-
19
-
20
- SECRET_REPLACEMENTS = [
21
- (re.compile(r"sk-or-v1-[A-Za-z0-9_-]+"), "sk-or-v1-[REDACTED]"),
22
- (re.compile(r"sk-[A-Za-z0-9_-]{16,}"), "sk-[REDACTED]"),
23
- (re.compile(r"hf_[A-Za-z0-9]{16,}"), "hf_[REDACTED]"),
24
- (re.compile(r"KGAT_[A-Za-z0-9]{16,}"), "KGAT_[REDACTED]"),
25
- (re.compile(r"npm_[A-Za-z0-9]{16,}"), "npm_[REDACTED]"),
26
- ]
27
-
28
- ORACLE_FIELD_RE = re.compile(
29
- r"(^|_)(patch|test_patch|solution|answer|gold|fail_to_pass|pass_to_pass)($|_)",
30
- re.IGNORECASE,
31
- )
32
-
33
- SAFE_FIELD_ORDER = [
34
- "instance_id",
35
- "task_id",
36
- "repo",
37
- "base_commit",
38
- "version",
39
- "created_at",
40
- "problem_statement",
41
- "hints_text",
42
- "description",
43
- "description_no_samples",
44
- "samples",
45
- "num_tests",
46
- "num_samples",
47
- "problem_link",
48
- "problem_level",
49
- "cp_id",
50
- "problem_id",
51
- "runtime_limit",
52
- "memory_limit",
53
- "runtime_limit_sentences",
54
- "memory_limit_sentences",
55
- "task_inst",
56
- "dataset_path",
57
- "dataset_folder_tree",
58
- "dataset_preview",
59
- "output_fname",
60
- "domain_knowledge",
61
- ]
62
-
63
-
64
- @dataclass
65
- class AgentRun:
66
- returncode: int
67
- stdout: str
68
- stderr: str
69
- trace_dir: Path
70
-
71
-
72
- def _redact(text: Any) -> str:
73
- value = str(text or "")
74
- for pattern, replacement in SECRET_REPLACEMENTS:
75
- value = pattern.sub(replacement, value)
76
- return value
77
-
78
-
79
- def _truncate(text: str, limit: int = 50000) -> str:
80
- clean = _redact(text)
81
- if len(clean) <= limit:
82
- return clean
83
- omitted = len(clean) - limit
84
- return clean[:limit] + f"\n...[truncated {omitted} chars]"
85
-
86
-
87
- def _safe_task_id(task_id: Any) -> str:
88
- raw = str(task_id or "task")
89
- safe = re.sub(r"[^A-Za-z0-9_.-]+", "-", raw).strip("-")
90
- return safe or "task"
91
-
92
-
93
- def _include_oracle_fields() -> bool:
94
- return os.environ.get("CAWDEX_HAL_INCLUDE_ORACLE_FIELDS", "").lower() in {"1", "true", "yes", "on"}
95
-
96
-
97
- def _safe_task_view(task: dict[str, Any]) -> tuple[dict[str, Any], list[str]]:
98
- if _include_oracle_fields():
99
- return task, []
100
-
101
- allowed: dict[str, Any] = {}
102
- omitted: list[str] = []
103
- ordered_keys = [key for key in SAFE_FIELD_ORDER if key in task]
104
- ordered_keys.extend(sorted(key for key in task if key not in ordered_keys))
105
- for key in ordered_keys:
106
- if ORACLE_FIELD_RE.search(key):
107
- omitted.append(key)
108
- continue
109
- allowed[key] = task[key]
110
- return allowed, omitted
111
-
112
-
113
- def _is_patch_task(task: dict[str, Any]) -> bool:
114
- return bool(
115
- task.get("problem_statement")
116
- and (task.get("repo") or task.get("base_commit") or task.get("instance_id"))
117
- )
118
-
119
-
120
- def _is_science_agent_task(task: dict[str, Any]) -> bool:
121
- return bool(
122
- task.get("task_inst")
123
- and (task.get("dataset_path") or task.get("output_fname") or task.get("dataset_folder_tree"))
124
- )
125
-
126
-
127
- def _is_appworld_task(task: dict[str, Any]) -> bool:
128
- keys = set(task.keys())
129
- return bool(task.get("task_id") and keys.issubset({"task_id", "instance_id"}))
130
-
131
-
132
- def _is_usaco_task(task: dict[str, Any]) -> bool:
133
- return bool(
134
- task.get("description")
135
- and (task.get("samples") or task.get("cp_id") or task.get("problem_id") or task.get("problem_link"))
136
- )
137
-
138
-
139
- def _profile_for_task(task: dict[str, Any]) -> str:
140
- task_text = json.dumps(task, ensure_ascii=False).lower()
141
- if _is_appworld_task(task) or "appworld" in task_text or "app-world" in task_text:
142
- return "appworld"
143
- if (
144
- "browsecomp" in task_text
145
- or "browsecomp+" in task_text
146
- or "browse-comp" in task_text
147
- or "deep research" in task_text
148
- or "web research" in task_text
149
- ):
150
- return "browsecomp"
151
- if (
152
- "tau2" in task_text
153
- or "tau 2" in task_text
154
- or "tau-bench" in task_text
155
- or "tau_bench" in task_text
156
- or "taubench" in task_text
157
- or "customer support" in task_text
158
- ):
159
- return "tau2"
160
- if (
161
- "roadmapbench" in task_text
162
- or "roadmap-bench" in task_text
163
- or "long-horizon" in task_text
164
- or "long horizon" in task_text
165
- or "version upgrade" in task_text
166
- or "multi-target" in task_text
167
- ):
168
- return "roadmapbench"
169
- if (
170
- "swe-cycle" in task_text
171
- or "swecycle" in task_text
172
- or "swe cycle" in task_text
173
- or "swe-judge" in task_text
174
- or "swejudge" in task_text
175
- or "fullcycle" in task_text
176
- or "codeimpl" in task_text
177
- or "testgen" in task_text
178
- or "run_script" in task_text
179
- or "parsing_script" in task_text
180
- or "selected_test_files_to_run" in task_text
181
- or "environment_setup_commit" in task_text
182
- or "before_repo_set_cmd" in task_text
183
- or "bare repository" in task_text
184
- ):
185
- return "swe-cycle"
186
- if (
187
- "swe-ci" in task_text
188
- or "sweci" in task_text
189
- or "swe ci" in task_text
190
- or "run_tests" in task_text
191
- or "define_requirements" in task_text
192
- or "modify_code" in task_text
193
- or "test gap" in task_text
194
- or "current_sha" in task_text
195
- or "target_sha" in task_text
196
- or ("codebase maintenance" in task_text and "continuous integration" in task_text)
197
- ):
198
- return "swe-ci"
199
- if (
200
- "swe-prbench" in task_text
201
- or "swe prbench" in task_text
202
- or "swe-pr" in task_text
203
- or "prbench" in task_text
204
- or "pull request review" in task_text
205
- or "code review quality" in task_text
206
- or "human_review_comments" in task_text
207
- or "diff_patch" in task_text
208
- or "type2_contextual" in task_text
209
- ):
210
- return "swe-prbench"
211
- if (
212
- "tml-bench" in task_text
213
- or "tmlbench" in task_text
214
- or "tabular ml" in task_text
215
- or "kaggle-style" in task_text
216
- or "kaggle style" in task_text
217
- or "sample_submission" in task_text
218
- or "private holdout" in task_text
219
- or ("train.csv" in task_text and "test.csv" in task_text and "submission" in task_text)
220
- ):
221
- return "tml-bench"
222
- if (
223
- "pi-bench" in task_text
224
- or "pibench" in task_text
225
- or "proactive personal assistant" in task_text
226
- or "proactive assistant" in task_text
227
- or "hidden intent" in task_text
228
- or "latent intent" in task_text
229
- or "user profile" in task_text
230
- or "message history" in task_text
231
- or "current app" in task_text
232
- or "proactivity score" in task_text
233
- or "completion score" in task_text
234
- ):
235
- return "pi-bench"
236
- if (
237
- "saasbench" in task_text
238
- or "saas-bench" in task_text
239
- or "enterprise saas" in task_text
240
- or "validation nodes" in task_text
241
- or "tenant" in task_text
242
- or "migration" in task_text
243
- ):
244
- return "saasbench"
245
- if (
246
- "swe-bench mobile" in task_text
247
- or "swebench mobile" in task_text
248
- or "xcode" in task_text
249
- or "swift" in task_text
250
- or "objective-c" in task_text
251
- or "figma" in task_text
252
- or "simulator" in task_text
253
- ):
254
- return "swe-bench-mobile"
255
- if (
256
- "swe-webdevbench" in task_text
257
- or "swe-webdev-bench" in task_text
258
- or "webdevbench" in task_text
259
- or "webdev-bench" in task_text
260
- or "vibe coding" in task_text
261
- or "virtual software agency" in task_text
262
- or "canary requirement" in task_text
263
- or "frontend-backend" in task_text
264
- or "production readiness" in task_text
265
- ):
266
- return "webdevbench"
267
- if _is_patch_task(task):
268
- return "swe-bench"
269
- if (
270
- "terminalworld" in task_text
271
- or "terminal-world" in task_text
272
- or "tw_" in task_text
273
- or "asciinema" in task_text
274
- ):
275
- return "terminalworld"
276
- if "terminal-bench" in task_text or "terminalbench" in task_text:
277
- return "terminal-bench"
278
- if (
279
- "wildclaw" in task_text
280
- or "openclaw" in task_text
281
- or "browsecomp" in task_text
282
- or "ossworld" in task_text
283
- or "bfcl" in task_text
284
- or "webwalkerqa" in task_text
285
- ):
286
- return "wildclaw"
287
- if (
288
- "arc-agi" in task_text
289
- or "arc_agi" in task_text
290
- or "arc prize" in task_text
291
- or "arc-prize" in task_text
292
- or "kaggle arc" in task_text
293
- ):
294
- return "arc-agi"
295
- if (
296
- "specbench" in task_text
297
- or "spec-bench" in task_text
298
- or "spec compliance" in task_text
299
- or "visible tests" in task_text
300
- or "held-out" in task_text
301
- or "holdout" in task_text
302
- ):
303
- return "specbench"
304
- if (
305
- "reward hacking benchmark" in task_text
306
- or "reward-hacking" in task_text
307
- or "reward_hacking" in task_text
308
- or "rhb" in task_text
309
- or "evaluator tamper" in task_text
310
- ):
311
- return "reward-hacking"
312
- return "generic"
313
-
314
-
315
- def _build_prompt(task_id: str, task: dict[str, Any]) -> str:
316
- profile = _profile_for_task(task)
317
- safe_task, omitted = _safe_task_view(task)
318
- body = json.dumps(safe_task, ensure_ascii=False, indent=2, sort_keys=True)
319
-
320
- lines = [
321
- f"/benchmark {profile} HAL task {task_id}",
322
- "",
323
- "You are running inside the Holistic Agent Leaderboard harness.",
324
- "Use Cawdex benchmark discipline: inspect local files, patch only what is needed, run targeted verification, and preserve trace evidence.",
325
- ]
326
- if profile == "swe-bench":
327
- lines.extend([
328
- "This is a SWE-bench-style patch task. Modify the checked-out repository; the HAL adapter will collect the git patch after the run.",
329
- "Do not edit tests or harness files unless the task explicitly asks for that.",
330
- ])
331
- elif profile == "terminalworld":
332
- lines.append("This is a TerminalWorld-style terminal workflow. Treat instruction.md/task text as the contract, avoid solve.sh/reference material, produce required persistent artifacts, and verify files/services in the environment.")
333
- elif _is_science_agent_task(task):
334
- lines.append("This is a ScienceAgentBench-style task. Produce a concise solution trajectory and any required output/program artifact in the final response.")
335
- elif profile == "appworld" or _is_appworld_task(task):
336
- lines.append("This is an AppWorld-style environment task. Interact with the environment as needed, then complete the task through the environment API.")
337
- elif profile == "browsecomp":
338
- lines.append("This is a BrowseComp+-style research task. Use source-grounded browsing/retrieval evidence, cross-check claims, and return the answer with auditable attribution.")
339
- elif profile == "tau2":
340
- lines.append("This is a tau2/Tau-Bench-style policy workflow. Follow the domain policy, use only available action schemas, and verify tool observations before completing.")
341
- elif profile == "webdevbench":
342
- lines.append("This is a SWE-WebDevBench-style full-stack app-agency task. Preserve canary business requirements, verify frontend-backend coupling, and collect production/security evidence when feasible.")
343
- elif profile == "swe-cycle":
344
- lines.append("This is a SWE-Cycle/SWE-Judge-style issue-resolution lifecycle task. Track environment setup, code implementation, verification-test generation when required, and post-edit static/dynamic judge evidence.")
345
- elif profile == "swe-ci":
346
- lines.append("This is a SWE-CI-style repository evolution task. Track current/target commits, test gaps, requirement derivation, and CI-loop validation across run_tests -> define_requirements -> modify_code.")
347
- elif profile == "swe-prbench":
348
- lines.append("This is a SWE-PRBench-style pull request review task. Inspect PR metadata and diff first, expand context only for concrete suspected issues, and return severity-rated review findings with file/line evidence instead of patching unless explicitly requested.")
349
- elif profile == "tml-bench":
350
- lines.append("This is a TML-Bench/Kaggle-style tabular ML task. Build the data contract first, avoid hidden-label leakage, train an honest baseline, and produce a sample_submission-compatible artifact with validation evidence.")
351
- elif profile == "pi-bench":
352
- lines.append("This is a Pi-Bench-style proactive personal assistant task. Build the user/workspace/app context contract, infer hidden intents carefully, ask one focused clarification when needed, and verify observable state after proactive actions.")
353
- elif _is_usaco_task(task):
354
- lines.append("This is a USACO-style programming task. Produce the final code solution in the final response.")
355
- else:
356
- lines.append("Return the final task response clearly; the HAL adapter will store it in the task response field.")
357
-
358
- if omitted:
359
- lines.append("Oracle-like task fields omitted from the prompt by default: " + ", ".join(sorted(omitted)) + ".")
360
-
361
- lines.extend(["", "## HAL task data", _truncate(body)])
362
- return "\n".join(lines)
363
-
364
-
365
- def _base_command() -> list[str]:
366
- command = os.environ.get("CAWDEX_HAL_COMMAND") or os.environ.get("CAWDEX_HAL_COMMAND", "cawdex")
367
- parts = shlex.split(command, posix=os.name != "nt")
368
- return parts or ["cawdex"]
369
-
370
-
371
- def _append_flag(args: list[str], flag: str, value: Any) -> None:
372
- if value is None:
373
- return
374
- text = str(value).strip()
375
- if not text:
376
- return
377
- args.extend([flag, text])
378
-
379
-
380
- def _run_cawdex(task_id: str, prompt: str, kwargs: dict[str, Any]) -> AgentRun:
381
- trace_root = Path(os.environ.get("CAWDEX_HAL_TRACE_DIR", ".cawdex/hal-trace"))
382
- trace_dir = trace_root / _safe_task_id(task_id)
383
- trace_dir.mkdir(parents=True, exist_ok=True)
384
-
385
- env = os.environ.copy()
386
- env.setdefault("CAWDEX_ENV_CONFIG", "1")
387
- env.setdefault("CAWDEX_THEME", "minimal")
388
- env.setdefault("CAWDEX_SHOW_THINKING", "0")
389
- env.setdefault("CAWDEX_MEMORY", "0")
390
- env.setdefault("CAWDEX_BASH_TIMEOUT_MS", "300000")
391
-
392
- args = _base_command()
393
- args.extend([
394
- "--prompt",
395
- prompt,
396
- "--perm",
397
- "yolo",
398
- "--benchmark-trace-dir",
399
- str(trace_dir),
400
- ])
401
- _append_flag(args, "--model", kwargs.get("model_name") or kwargs.get("model"))
402
- _append_flag(args, "--provider", kwargs.get("provider"))
403
- _append_flag(args, "--max-turns", kwargs.get("max_turns"))
404
- _append_flag(args, "--max-tokens", kwargs.get("max_tokens"))
405
- _append_flag(args, "--temperature", kwargs.get("temperature"))
406
- _append_flag(args, "--output-format", kwargs.get("output_format"))
407
-
408
- timeout = int(os.environ.get("CAWDEX_HAL_TIMEOUT_SEC", "1800"))
409
- try:
410
- completed = subprocess.run(
411
- args,
412
- cwd=os.getcwd(),
413
- env=env,
414
- text=True,
415
- capture_output=True,
416
- timeout=timeout,
417
- check=False,
418
- )
419
- stdout = _redact(completed.stdout)
420
- stderr = _redact(completed.stderr)
421
- returncode = completed.returncode
422
- except subprocess.TimeoutExpired as exc:
423
- stdout = _redact(exc.stdout)
424
- stderr = _redact(exc.stderr) + f"\nCawdex timed out after {timeout}s"
425
- returncode = 124
426
-
427
- (trace_dir / "hal-stdout.txt").write_text(stdout, encoding="utf-8")
428
- (trace_dir / "hal-stderr.txt").write_text(stderr, encoding="utf-8")
429
- return AgentRun(returncode=returncode, stdout=stdout, stderr=stderr, trace_dir=trace_dir)
430
-
431
-
432
- def _run_git(args: list[str], cwd: Path | None = None) -> str:
433
- try:
434
- completed = subprocess.run(
435
- ["git", *args],
436
- cwd=str(cwd) if cwd else None,
437
- text=True,
438
- capture_output=True,
439
- check=False,
440
- timeout=60,
441
- )
442
- except Exception:
443
- return ""
444
- if completed.returncode not in {0, 1}:
445
- return ""
446
- return _redact(completed.stdout)
447
-
448
-
449
- def _latest_trace_patch(trace_dir: Path) -> str:
450
- patches = sorted(
451
- trace_dir.rglob("worktree.patch"),
452
- key=lambda path: path.stat().st_mtime,
453
- reverse=True,
454
- )
455
- for patch in patches:
456
- try:
457
- text = _redact(patch.read_text(encoding="utf-8", errors="replace"))
458
- if text.strip():
459
- return text
460
- except OSError:
461
- continue
462
- return ""
463
-
464
-
465
- def _collect_git_patch(trace_dir: Path) -> str:
466
- trace_patch = _latest_trace_patch(trace_dir)
467
- if trace_patch:
468
- return trace_patch
469
-
470
- parts = [
471
- _run_git(["diff", "--binary", "--no-ext-diff"]),
472
- _run_git(["diff", "--cached", "--binary", "--no-ext-diff"]),
473
- ]
474
- if os.name != "nt":
475
- raw_untracked = _run_git(["ls-files", "--others", "--exclude-standard", "-z"])
476
- for filename in raw_untracked.split("\0"):
477
- if filename:
478
- parts.append(_run_git(["diff", "--no-index", "--binary", "--no-ext-diff", "--", "/dev/null", filename]))
479
- return "".join(part for part in parts if part)
480
-
481
-
482
- def _latest_summary(trace_dir: Path) -> dict[str, Any]:
483
- summaries = sorted(
484
- trace_dir.rglob("summary.json"),
485
- key=lambda path: path.stat().st_mtime,
486
- reverse=True,
487
- )
488
- for summary in summaries:
489
- try:
490
- return json.loads(summary.read_text(encoding="utf-8"))
491
- except Exception:
492
- continue
493
- return {}
494
-
495
-
496
- def _response_text(run_result: AgentRun) -> str:
497
- summary = _latest_summary(run_result.trace_dir)
498
- final = summary.get("finalAssistant")
499
- if isinstance(final, str) and final.strip():
500
- return _truncate(final, 20000)
501
- combined = "\n".join(part for part in [run_result.stdout, run_result.stderr] if part)
502
- return _truncate(combined, 20000)
503
-
504
-
505
- def _submission_for_task(task: dict[str, Any], run_result: AgentRun) -> Any:
506
- response = _response_text(run_result)
507
- if _is_science_agent_task(task):
508
- return response
509
- if _is_appworld_task(task):
510
- return "Completed" if run_result.returncode == 0 else response
511
-
512
- updated = dict(task)
513
- updated["response"] = response
514
- return updated
515
-
516
-
517
- def run(input: dict[str, dict[str, Any]], **kwargs: Any) -> dict[str, Any]:
518
- """Run Cawdex for HAL.
519
-
520
- Patch-style tasks return {task_id: patch}. ScienceAgentBench-style tasks
521
- return a trajectory string. AppWorld-style tasks return "Completed" after
522
- a successful run. Other text/code tasks return the original task dict with
523
- a response field, matching HAL's USACO-style pattern.
524
- """
525
- if not isinstance(input, dict):
526
- raise TypeError("Cawdex HAL adapter expects input to be a dictionary")
527
-
528
- patch_task_ids = [
529
- str(task_id)
530
- for task_id, task in input.items()
531
- if isinstance(task, dict) and _is_patch_task(task)
532
- ]
533
- if len(patch_task_ids) > 1:
534
- raise ValueError("Cawdex HAL adapter expects one patch-style task per checked-out worktree")
535
-
536
- output: dict[str, Any] = {}
537
- for task_id, task in input.items():
538
- if not isinstance(task, dict):
539
- output[str(task_id)] = task
540
- continue
541
-
542
- prompt = _build_prompt(str(task_id), task)
543
- run_result = _run_cawdex(str(task_id), prompt, kwargs)
544
-
545
- if _is_patch_task(task):
546
- output[str(task_id)] = _collect_git_patch(run_result.trace_dir)
547
- else:
548
- output[str(task_id)] = _submission_for_task(task, run_result)
549
-
550
- return output
1
+ """HAL custom-agent adapter for Cawdex.
2
+
3
+ HAL expects a module-level run(input, **kwargs) function. This adapter keeps
4
+ Cawdex framework-agnostic by launching the installed CLI in headless
5
+ benchmark mode, then returning the artifact shape expected by common HAL tasks.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import os
12
+ import re
13
+ import shlex
14
+ import subprocess
15
+ from dataclasses import dataclass
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+
20
+ SECRET_REPLACEMENTS = [
21
+ (re.compile(r"sk-or-v1-[A-Za-z0-9_-]+"), "sk-or-v1-[REDACTED]"),
22
+ (re.compile(r"sk-[A-Za-z0-9_-]{16,}"), "sk-[REDACTED]"),
23
+ (re.compile(r"hf_[A-Za-z0-9]{16,}"), "hf_[REDACTED]"),
24
+ (re.compile(r"KGAT_[A-Za-z0-9]{16,}"), "KGAT_[REDACTED]"),
25
+ (re.compile(r"npm_[A-Za-z0-9]{16,}"), "npm_[REDACTED]"),
26
+ ]
27
+
28
+ ORACLE_FIELD_RE = re.compile(
29
+ r"(^|_)(patch|test_patch|solution|answer|gold|fail_to_pass|pass_to_pass)($|_)",
30
+ re.IGNORECASE,
31
+ )
32
+
33
+ SAFE_FIELD_ORDER = [
34
+ "instance_id",
35
+ "task_id",
36
+ "repo",
37
+ "base_commit",
38
+ "version",
39
+ "created_at",
40
+ "problem_statement",
41
+ "hints_text",
42
+ "description",
43
+ "description_no_samples",
44
+ "samples",
45
+ "num_tests",
46
+ "num_samples",
47
+ "problem_link",
48
+ "problem_level",
49
+ "cp_id",
50
+ "problem_id",
51
+ "runtime_limit",
52
+ "memory_limit",
53
+ "runtime_limit_sentences",
54
+ "memory_limit_sentences",
55
+ "task_inst",
56
+ "dataset_path",
57
+ "dataset_folder_tree",
58
+ "dataset_preview",
59
+ "output_fname",
60
+ "domain_knowledge",
61
+ ]
62
+
63
+
64
+ @dataclass
65
+ class AgentRun:
66
+ returncode: int
67
+ stdout: str
68
+ stderr: str
69
+ trace_dir: Path
70
+
71
+
72
+ def _redact(text: Any) -> str:
73
+ value = str(text or "")
74
+ for pattern, replacement in SECRET_REPLACEMENTS:
75
+ value = pattern.sub(replacement, value)
76
+ return value
77
+
78
+
79
+ def _truncate(text: str, limit: int = 50000) -> str:
80
+ clean = _redact(text)
81
+ if len(clean) <= limit:
82
+ return clean
83
+ omitted = len(clean) - limit
84
+ return clean[:limit] + f"\n...[truncated {omitted} chars]"
85
+
86
+
87
+ def _safe_task_id(task_id: Any) -> str:
88
+ raw = str(task_id or "task")
89
+ safe = re.sub(r"[^A-Za-z0-9_.-]+", "-", raw).strip("-")
90
+ return safe or "task"
91
+
92
+
93
+ def _include_oracle_fields() -> bool:
94
+ return os.environ.get("CAWDEX_HAL_INCLUDE_ORACLE_FIELDS", "").lower() in {"1", "true", "yes", "on"}
95
+
96
+
97
+ def _safe_task_view(task: dict[str, Any]) -> tuple[dict[str, Any], list[str]]:
98
+ if _include_oracle_fields():
99
+ return task, []
100
+
101
+ allowed: dict[str, Any] = {}
102
+ omitted: list[str] = []
103
+ ordered_keys = [key for key in SAFE_FIELD_ORDER if key in task]
104
+ ordered_keys.extend(sorted(key for key in task if key not in ordered_keys))
105
+ for key in ordered_keys:
106
+ if ORACLE_FIELD_RE.search(key):
107
+ omitted.append(key)
108
+ continue
109
+ allowed[key] = task[key]
110
+ return allowed, omitted
111
+
112
+
113
+ def _is_patch_task(task: dict[str, Any]) -> bool:
114
+ return bool(
115
+ task.get("problem_statement")
116
+ and (task.get("repo") or task.get("base_commit") or task.get("instance_id"))
117
+ )
118
+
119
+
120
+ def _is_science_agent_task(task: dict[str, Any]) -> bool:
121
+ return bool(
122
+ task.get("task_inst")
123
+ and (task.get("dataset_path") or task.get("output_fname") or task.get("dataset_folder_tree"))
124
+ )
125
+
126
+
127
+ def _is_appworld_task(task: dict[str, Any]) -> bool:
128
+ keys = set(task.keys())
129
+ return bool(task.get("task_id") and keys.issubset({"task_id", "instance_id"}))
130
+
131
+
132
+ def _is_usaco_task(task: dict[str, Any]) -> bool:
133
+ return bool(
134
+ task.get("description")
135
+ and (task.get("samples") or task.get("cp_id") or task.get("problem_id") or task.get("problem_link"))
136
+ )
137
+
138
+
139
+ def _profile_for_task(task: dict[str, Any]) -> str:
140
+ task_text = json.dumps(task, ensure_ascii=False).lower()
141
+ if _is_appworld_task(task) or "appworld" in task_text or "app-world" in task_text:
142
+ return "appworld"
143
+ if (
144
+ "browsecomp" in task_text
145
+ or "browsecomp+" in task_text
146
+ or "browse-comp" in task_text
147
+ or "deep research" in task_text
148
+ or "web research" in task_text
149
+ ):
150
+ return "browsecomp"
151
+ if (
152
+ "tau2" in task_text
153
+ or "tau 2" in task_text
154
+ or "tau-bench" in task_text
155
+ or "tau_bench" in task_text
156
+ or "taubench" in task_text
157
+ or "customer support" in task_text
158
+ ):
159
+ return "tau2"
160
+ if (
161
+ "roadmapbench" in task_text
162
+ or "roadmap-bench" in task_text
163
+ or "long-horizon" in task_text
164
+ or "long horizon" in task_text
165
+ or "version upgrade" in task_text
166
+ or "multi-target" in task_text
167
+ ):
168
+ return "roadmapbench"
169
+ if (
170
+ "swe-cycle" in task_text
171
+ or "swecycle" in task_text
172
+ or "swe cycle" in task_text
173
+ or "swe-judge" in task_text
174
+ or "swejudge" in task_text
175
+ or "fullcycle" in task_text
176
+ or "codeimpl" in task_text
177
+ or "testgen" in task_text
178
+ or "run_script" in task_text
179
+ or "parsing_script" in task_text
180
+ or "selected_test_files_to_run" in task_text
181
+ or "environment_setup_commit" in task_text
182
+ or "before_repo_set_cmd" in task_text
183
+ or "bare repository" in task_text
184
+ ):
185
+ return "swe-cycle"
186
+ if (
187
+ "swe-ci" in task_text
188
+ or "sweci" in task_text
189
+ or "swe ci" in task_text
190
+ or "run_tests" in task_text
191
+ or "define_requirements" in task_text
192
+ or "modify_code" in task_text
193
+ or "test gap" in task_text
194
+ or "current_sha" in task_text
195
+ or "target_sha" in task_text
196
+ or ("codebase maintenance" in task_text and "continuous integration" in task_text)
197
+ ):
198
+ return "swe-ci"
199
+ if (
200
+ "swe-prbench" in task_text
201
+ or "swe prbench" in task_text
202
+ or "swe-pr" in task_text
203
+ or "prbench" in task_text
204
+ or "pull request review" in task_text
205
+ or "code review quality" in task_text
206
+ or "human_review_comments" in task_text
207
+ or "diff_patch" in task_text
208
+ or "type2_contextual" in task_text
209
+ ):
210
+ return "swe-prbench"
211
+ if (
212
+ "tml-bench" in task_text
213
+ or "tmlbench" in task_text
214
+ or "tabular ml" in task_text
215
+ or "kaggle-style" in task_text
216
+ or "kaggle style" in task_text
217
+ or "sample_submission" in task_text
218
+ or "private holdout" in task_text
219
+ or ("train.csv" in task_text and "test.csv" in task_text and "submission" in task_text)
220
+ ):
221
+ return "tml-bench"
222
+ if (
223
+ "pi-bench" in task_text
224
+ or "pibench" in task_text
225
+ or "proactive personal assistant" in task_text
226
+ or "proactive assistant" in task_text
227
+ or "hidden intent" in task_text
228
+ or "latent intent" in task_text
229
+ or "user profile" in task_text
230
+ or "message history" in task_text
231
+ or "current app" in task_text
232
+ or "proactivity score" in task_text
233
+ or "completion score" in task_text
234
+ ):
235
+ return "pi-bench"
236
+ if (
237
+ "saasbench" in task_text
238
+ or "saas-bench" in task_text
239
+ or "enterprise saas" in task_text
240
+ or "validation nodes" in task_text
241
+ or "tenant" in task_text
242
+ or "migration" in task_text
243
+ ):
244
+ return "saasbench"
245
+ if (
246
+ "swe-bench mobile" in task_text
247
+ or "swebench mobile" in task_text
248
+ or "xcode" in task_text
249
+ or "swift" in task_text
250
+ or "objective-c" in task_text
251
+ or "figma" in task_text
252
+ or "simulator" in task_text
253
+ ):
254
+ return "swe-bench-mobile"
255
+ if (
256
+ "swe-webdevbench" in task_text
257
+ or "swe-webdev-bench" in task_text
258
+ or "webdevbench" in task_text
259
+ or "webdev-bench" in task_text
260
+ or "vibe coding" in task_text
261
+ or "virtual software agency" in task_text
262
+ or "canary requirement" in task_text
263
+ or "frontend-backend" in task_text
264
+ or "production readiness" in task_text
265
+ ):
266
+ return "webdevbench"
267
+ if _is_patch_task(task):
268
+ return "swe-bench"
269
+ if (
270
+ "terminalworld" in task_text
271
+ or "terminal-world" in task_text
272
+ or "tw_" in task_text
273
+ or "asciinema" in task_text
274
+ ):
275
+ return "terminalworld"
276
+ if "terminal-bench" in task_text or "terminalbench" in task_text:
277
+ return "terminal-bench"
278
+ if (
279
+ "wildclaw" in task_text
280
+ or "openclaw" in task_text
281
+ or "browsecomp" in task_text
282
+ or "ossworld" in task_text
283
+ or "bfcl" in task_text
284
+ or "webwalkerqa" in task_text
285
+ ):
286
+ return "wildclaw"
287
+ if (
288
+ "arc-agi" in task_text
289
+ or "arc_agi" in task_text
290
+ or "arc prize" in task_text
291
+ or "arc-prize" in task_text
292
+ or "kaggle arc" in task_text
293
+ ):
294
+ return "arc-agi"
295
+ if (
296
+ "specbench" in task_text
297
+ or "spec-bench" in task_text
298
+ or "spec compliance" in task_text
299
+ or "visible tests" in task_text
300
+ or "held-out" in task_text
301
+ or "holdout" in task_text
302
+ ):
303
+ return "specbench"
304
+ if (
305
+ "reward hacking benchmark" in task_text
306
+ or "reward-hacking" in task_text
307
+ or "reward_hacking" in task_text
308
+ or "rhb" in task_text
309
+ or "evaluator tamper" in task_text
310
+ ):
311
+ return "reward-hacking"
312
+ return "generic"
313
+
314
+
315
def _build_prompt(task_id: str, task: dict[str, Any]) -> str:
    """Compose the prompt text handed to Cawdex for a single HAL task.

    The prompt consists of a ``/benchmark`` header naming the detected
    profile and the task id, two fixed harness sentences, one block of
    benchmark-specific guidance, an optional note listing the task fields
    that were left out of the prompt, and finally the (truncated) JSON dump
    of the sanitized task.

    Args:
        task_id: Identifier of the task, interpolated into the header line.
        task: Raw HAL task dictionary.

    Returns:
        The prompt as a single newline-joined string.
    """
    profile = _profile_for_task(task)
    safe_task, omitted = _safe_task_view(task)
    body = json.dumps(safe_task, ensure_ascii=False, indent=2, sort_keys=True)

    # Guidance that depends on the profile name alone. These are consulted
    # only after the ScienceAgentBench and AppWorld checks below, and before
    # the USACO check, which is the order the selection has always used.
    profile_guidance = {
        "browsecomp": "This is a BrowseComp+-style research task. Use source-grounded browsing/retrieval evidence, cross-check claims, and return the answer with auditable attribution.",
        "tau2": "This is a tau2/Tau-Bench-style policy workflow. Follow the domain policy, use only available action schemas, and verify tool observations before completing.",
        "webdevbench": "This is a SWE-WebDevBench-style full-stack app-agency task. Preserve canary business requirements, verify frontend-backend coupling, and collect production/security evidence when feasible.",
        "swe-cycle": "This is a SWE-Cycle/SWE-Judge-style issue-resolution lifecycle task. Track environment setup, code implementation, verification-test generation when required, and post-edit static/dynamic judge evidence.",
        "swe-ci": "This is a SWE-CI-style repository evolution task. Track current/target commits, test gaps, requirement derivation, and CI-loop validation across run_tests -> define_requirements -> modify_code.",
        "swe-prbench": "This is a SWE-PRBench-style pull request review task. Inspect PR metadata and diff first, expand context only for concrete suspected issues, and return severity-rated review findings with file/line evidence instead of patching unless explicitly requested.",
        "tml-bench": "This is a TML-Bench/Kaggle-style tabular ML task. Build the data contract first, avoid hidden-label leakage, train an honest baseline, and produce a sample_submission-compatible artifact with validation evidence.",
        "pi-bench": "This is a Pi-Bench-style proactive personal assistant task. Build the user/workspace/app context contract, infer hidden intents carefully, ask one focused clarification when needed, and verify observable state after proactive actions.",
    }

    if profile == "swe-bench":
        guidance = [
            "This is a SWE-bench-style patch task. Modify the checked-out repository; the HAL adapter will collect the git patch after the run.",
            "Do not edit tests or harness files unless the task explicitly asks for that.",
        ]
    elif profile == "terminalworld":
        guidance = [
            "This is a TerminalWorld-style terminal workflow. Treat instruction.md/task text as the contract, avoid solve.sh/reference material, produce required persistent artifacts, and verify files/services in the environment.",
        ]
    elif _is_science_agent_task(task):
        guidance = [
            "This is a ScienceAgentBench-style task. Produce a concise solution trajectory and any required output/program artifact in the final response.",
        ]
    elif profile == "appworld" or _is_appworld_task(task):
        guidance = [
            "This is an AppWorld-style environment task. Interact with the environment as needed, then complete the task through the environment API.",
        ]
    elif profile in profile_guidance:
        guidance = [profile_guidance[profile]]
    elif _is_usaco_task(task):
        guidance = [
            "This is a USACO-style programming task. Produce the final code solution in the final response.",
        ]
    else:
        guidance = [
            "Return the final task response clearly; the HAL adapter will store it in the task response field.",
        ]

    sections = [
        f"/benchmark {profile} HAL task {task_id}",
        "",
        "You are running inside the Holistic Agent Leaderboard harness.",
        "Use Cawdex benchmark discipline: inspect local files, patch only what is needed, run targeted verification, and preserve trace evidence.",
        *guidance,
    ]

    if omitted:
        # Tell the agent which fields were withheld so it does not go looking for them.
        sections.append("Oracle-like task fields omitted from the prompt by default: " + ", ".join(sorted(omitted)) + ".")

    sections += ["", "## HAL task data", _truncate(body)]
    return "\n".join(sections)
363
+
364
+
365
+ def _base_command() -> list[str]:
366
+ command = os.environ.get("CAWDEX_HAL_COMMAND") or os.environ.get("CAWDEX_HAL_COMMAND", "cawdex")
367
+ parts = shlex.split(command, posix=os.name != "nt")
368
+ return parts or ["cawdex"]
369
+
370
+
371
+ def _append_flag(args: list[str], flag: str, value: Any) -> None:
372
+ if value is None:
373
+ return
374
+ text = str(value).strip()
375
+ if not text:
376
+ return
377
+ args.extend([flag, text])
378
+
379
+
380
def _run_cawdex(task_id: str, prompt: str, kwargs: dict[str, Any]) -> AgentRun:
    """Run the Cawdex CLI once for a task and capture its output.

    A per-task trace directory is created under ``CAWDEX_HAL_TRACE_DIR``
    (default ``.cawdex/hal-trace``). The child process inherits the current
    environment plus a handful of ``CAWDEX_*`` defaults that are only applied
    when not already set. Redacted stdout/stderr are written to
    ``hal-stdout.txt`` and ``hal-stderr.txt`` in the trace directory.

    Args:
        task_id: Task identifier; used to name the trace directory.
        prompt: Prompt text passed through ``--prompt``.
        kwargs: Adapter keyword arguments; ``model_name``/``model``,
            ``provider``, ``max_turns``, ``max_tokens``, ``temperature`` and
            ``output_format`` are forwarded as CLI flags when present.

    Returns:
        An ``AgentRun`` with the return code (``124`` on timeout), the
        redacted output streams, and the trace directory.

    Raises:
        ValueError: If ``CAWDEX_HAL_TIMEOUT_SEC`` is not an integer.
        OSError: If the Cawdex executable cannot be started.
    """

    def _partial_text(captured: Any) -> str:
        # TimeoutExpired may carry None (nothing captured) or raw bytes even
        # when the process was started in text mode; normalise to str.
        if captured is None:
            return ""
        if isinstance(captured, bytes):
            return captured.decode("utf-8", errors="replace")
        return str(captured)

    trace_root = Path(os.environ.get("CAWDEX_HAL_TRACE_DIR", ".cawdex/hal-trace"))
    trace_dir = trace_root / _safe_task_id(task_id)
    trace_dir.mkdir(parents=True, exist_ok=True)

    env = os.environ.copy()
    env.setdefault("CAWDEX_ENV_CONFIG", "1")
    env.setdefault("CAWDEX_THEME", "minimal")
    env.setdefault("CAWDEX_SHOW_THINKING", "0")
    env.setdefault("CAWDEX_MEMORY", "0")
    env.setdefault("CAWDEX_BASH_TIMEOUT_MS", "300000")

    args = _base_command()
    args.extend([
        "--prompt",
        prompt,
        "--perm",
        "yolo",
        "--benchmark-trace-dir",
        str(trace_dir),
    ])
    _append_flag(args, "--model", kwargs.get("model_name") or kwargs.get("model"))
    _append_flag(args, "--provider", kwargs.get("provider"))
    _append_flag(args, "--max-turns", kwargs.get("max_turns"))
    _append_flag(args, "--max-tokens", kwargs.get("max_tokens"))
    _append_flag(args, "--temperature", kwargs.get("temperature"))
    _append_flag(args, "--output-format", kwargs.get("output_format"))

    timeout = int(os.environ.get("CAWDEX_HAL_TIMEOUT_SEC", "1800"))
    try:
        completed = subprocess.run(
            args,
            cwd=os.getcwd(),
            env=env,
            text=True,
            # Decode explicitly and leniently: the streams are written back
            # as UTF-8 below, and a stray byte must not abort the whole run.
            encoding="utf-8",
            errors="replace",
            capture_output=True,
            timeout=timeout,
            check=False,
        )
    except subprocess.TimeoutExpired as exc:
        stdout = _redact(_partial_text(exc.stdout))
        stderr = _redact(_partial_text(exc.stderr)) + f"\nCawdex timed out after {timeout}s"
        # 124 mirrors the exit status used by coreutils `timeout`.
        returncode = 124
    else:
        stdout = _redact(completed.stdout)
        stderr = _redact(completed.stderr)
        returncode = completed.returncode

    (trace_dir / "hal-stdout.txt").write_text(stdout, encoding="utf-8")
    (trace_dir / "hal-stderr.txt").write_text(stderr, encoding="utf-8")
    return AgentRun(returncode=returncode, stdout=stdout, stderr=stderr, trace_dir=trace_dir)
430
+
431
+
432
def _run_git(args: list[str], cwd: Path | None = None) -> str:
    """Run ``git`` with ``args`` and return its redacted stdout.

    This is a best-effort helper: any failure to run git, a timeout (60 s),
    or an exit status other than 0 or 1 yields an empty string instead of an
    exception. Status 1 is accepted because ``git diff --no-index`` uses it
    to signal "differences found".

    Args:
        args: Arguments placed after the ``git`` executable name.
        cwd: Directory to run in; the current working directory when omitted.

    Returns:
        Redacted standard output, or ``""`` on failure.
    """
    try:
        completed = subprocess.run(
            ["git", *args],
            cwd=str(cwd) if cwd else None,
            text=True,
            # Diffs may contain bytes that are not valid in the locale
            # encoding. Decoding strictly would raise and discard the whole
            # patch, so decode as UTF-8 and substitute undecodable bytes.
            encoding="utf-8",
            errors="replace",
            capture_output=True,
            check=False,
            timeout=60,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # git missing, bad cwd, invalid argument, or timeout.
        return ""
    if completed.returncode not in {0, 1}:
        return ""
    return _redact(completed.stdout)
447
+
448
+
449
def _latest_trace_patch(trace_dir: Path) -> str:
    """Return the newest non-empty ``worktree.patch`` found under ``trace_dir``.

    Files are searched recursively and tried from most to least recently
    modified. Unreadable files and files that are blank after redaction are
    skipped.

    Args:
        trace_dir: Root directory to search.

    Returns:
        The redacted patch text, or ``""`` when no usable patch exists.
    """
    candidates = list(trace_dir.rglob("worktree.patch"))
    candidates.sort(key=lambda candidate: candidate.stat().st_mtime, reverse=True)

    for candidate in candidates:
        try:
            content = _redact(candidate.read_text(encoding="utf-8", errors="replace"))
        except OSError:
            continue
        if content.strip():
            return content
    return ""
463
+
464
+
465
def _collect_git_patch(trace_dir: Path) -> str:
    """Produce the patch for a patch-style task.

    A patch recorded in the trace directory wins. Otherwise the patch is
    assembled from the working tree: unstaged changes, then staged changes,
    then (except on Windows) one ``/dev/null`` diff per untracked,
    non-ignored file.

    Args:
        trace_dir: Trace directory of the run that just finished.

    Returns:
        The concatenated patch text; empty when nothing changed.
    """
    recorded = _latest_trace_patch(trace_dir)
    if recorded:
        return recorded

    unstaged = _run_git(["diff", "--binary", "--no-ext-diff"])
    staged = _run_git(["diff", "--cached", "--binary", "--no-ext-diff"])
    chunks = [unstaged, staged]

    if os.name != "nt":
        # NUL-separated listing so unusual file names survive intact.
        listing = _run_git(["ls-files", "--others", "--exclude-standard", "-z"])
        chunks.extend(
            _run_git(["diff", "--no-index", "--binary", "--no-ext-diff", "--", "/dev/null", untracked])
            for untracked in listing.split("\0")
            if untracked
        )

    return "".join(filter(None, chunks))
480
+
481
+
482
+ def _latest_summary(trace_dir: Path) -> dict[str, Any]:
483
+ summaries = sorted(
484
+ trace_dir.rglob("summary.json"),
485
+ key=lambda path: path.stat().st_mtime,
486
+ reverse=True,
487
+ )
488
+ for summary in summaries:
489
+ try:
490
+ return json.loads(summary.read_text(encoding="utf-8"))
491
+ except Exception:
492
+ continue
493
+ return {}
494
+
495
+
496
def _response_text(run_result: AgentRun) -> str:
    """Pick the text to report as the agent's answer for a run.

    The ``finalAssistant`` entry of the latest trace summary is used when it
    is a non-blank string. Otherwise the non-empty captured stdout and
    stderr are joined with a newline. Either way the result is truncated to
    20000 characters.

    Args:
        run_result: Outcome of a Cawdex invocation.

    Returns:
        The truncated response text.
    """
    final_message = _latest_summary(run_result.trace_dir).get("finalAssistant")
    if isinstance(final_message, str) and final_message.strip():
        return _truncate(final_message, 20000)

    streams = (run_result.stdout, run_result.stderr)
    return _truncate("\n".join(filter(None, streams)), 20000)
503
+
504
+
505
def _submission_for_task(task: dict[str, Any], run_result: AgentRun) -> Any:
    """Shape the HAL submission value for a non-patch task.

    * ScienceAgentBench-style tasks: the response text itself.
    * AppWorld-style tasks: ``"Completed"`` when the run exited with status
      0, otherwise the response text.
    * Everything else: a shallow copy of the task with a ``"response"`` key
      holding the response text.

    Args:
        task: Original HAL task dictionary (not mutated).
        run_result: Outcome of the Cawdex invocation for this task.

    Returns:
        The submission value in the shape described above.
    """
    response = _response_text(run_result)

    if _is_science_agent_task(task):
        return response

    if not _is_appworld_task(task):
        return {**task, "response": response}

    return response if run_result.returncode != 0 else "Completed"
515
+
516
+
517
def run(input: dict[str, dict[str, Any]], **kwargs: Any) -> dict[str, Any]:
    """HAL entry point: run Cawdex on every task in ``input``.

    The value stored per task id depends on the task kind: patch-style tasks
    yield the collected patch text; ScienceAgentBench-style tasks yield a
    trajectory string; AppWorld-style tasks yield "Completed" after a
    successful run; other text/code tasks yield the original task dict with
    an added response field (HAL's USACO-style pattern). Entries whose task
    is not a dictionary are passed through unchanged.

    Args:
        input: Mapping of task id to task dictionary.
        **kwargs: Adapter options forwarded to the Cawdex invocation.

    Returns:
        Mapping of stringified task id to the submission value.

    Raises:
        TypeError: If ``input`` is not a dictionary.
        ValueError: If more than one patch-style task is supplied, since
            they would share a single checked-out worktree.
    """
    if not isinstance(input, dict):
        raise TypeError("Cawdex HAL adapter expects input to be a dictionary")

    patch_task_count = sum(
        1
        for candidate in input.values()
        if isinstance(candidate, dict) and _is_patch_task(candidate)
    )
    if patch_task_count > 1:
        raise ValueError("Cawdex HAL adapter expects one patch-style task per checked-out worktree")

    results: dict[str, Any] = {}
    for raw_id, task in input.items():
        key = str(raw_id)
        if not isinstance(task, dict):
            results[key] = task
            continue

        agent_run = _run_cawdex(key, _build_prompt(key, task), kwargs)
        results[key] = (
            _collect_git_patch(agent_run.trace_dir)
            if _is_patch_task(task)
            else _submission_for_task(task, agent_run)
        )

    return results