cawdex 1.35.75 → 1.35.76
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/bin/anycode.js +2 -2
- package/bin/cawdex.js +408 -408
- package/bin/ecc-hooks.cjs +11 -11
- package/dist/agents-md.d.ts +31 -0
- package/dist/agents-md.js +340 -0
- package/dist/agents-md.js.map +1 -0
- package/dist/agents.js +1424 -1424
- package/dist/api.d.ts +1 -0
- package/dist/api.js +19 -14
- package/dist/api.js.map +1 -1
- package/dist/autonomous-loops.js +287 -287
- package/dist/benchmark-repos.d.ts +31 -0
- package/dist/benchmark-repos.js +234 -8
- package/dist/benchmark-repos.js.map +1 -1
- package/dist/command-palette.js +4 -2
- package/dist/command-palette.js.map +1 -1
- package/dist/compaction.js +8 -8
- package/dist/config.js +51 -36
- package/dist/config.js.map +1 -1
- package/dist/content-engine.js +543 -543
- package/dist/context-brief.d.ts +4 -0
- package/dist/context-brief.js +230 -0
- package/dist/context-brief.js.map +1 -0
- package/dist/cost-tracker.d.ts +33 -14
- package/dist/cost-tracker.js +81 -19
- package/dist/cost-tracker.js.map +1 -1
- package/dist/coverage.js +39 -39
- package/dist/docs-sync.js +98 -98
- package/dist/evaluation.js +452 -452
- package/dist/fixed-footer.d.ts +7 -1
- package/dist/fixed-footer.js +92 -18
- package/dist/fixed-footer.js.map +1 -1
- package/dist/git-workflow.js +49 -49
- package/dist/index.d.ts +2 -0
- package/dist/index.js +161 -63
- package/dist/index.js.map +1 -1
- package/dist/live-queue.js +1 -1
- package/dist/live-queue.js.map +1 -1
- package/dist/model-aliases.d.ts +37 -0
- package/dist/model-aliases.js +203 -0
- package/dist/model-aliases.js.map +1 -0
- package/dist/orchestration.js +15 -15
- package/dist/permissions.d.ts +6 -0
- package/dist/permissions.js +53 -0
- package/dist/permissions.js.map +1 -1
- package/dist/pm2-manager.js +26 -26
- package/dist/query.d.ts +0 -1
- package/dist/query.js +74 -39
- package/dist/query.js.map +1 -1
- package/dist/refactor.js +87 -87
- package/dist/repo-command.js +7 -1
- package/dist/repo-command.js.map +1 -1
- package/dist/search-first.js +92 -92
- package/dist/skill-create.js +100 -100
- package/dist/stitch.js +1 -1
- package/dist/system-prompt.d.ts +2 -1
- package/dist/system-prompt.js +10 -5
- package/dist/system-prompt.js.map +1 -1
- package/dist/tools/github-repo-digest.d.ts +1 -1
- package/dist/tools/github-repo-digest.js +38 -6
- package/dist/tools/github-repo-digest.js.map +1 -1
- package/dist/types.d.ts +3 -0
- package/dist/types.js.map +1 -1
- package/dist/verification.js +55 -55
- package/package.json +1 -1
- package/resources/__init__.py +1 -1
- package/resources/exgentic/cawdex_agent/README.md +114 -114
- package/resources/exgentic/cawdex_agent/__init__.py +5 -5
- package/resources/exgentic/cawdex_agent/agent.py +605 -605
- package/resources/exgentic/cawdex_agent/requirements.txt +2 -2
- package/resources/exgentic/cawdex_agent/setup.sh +21 -21
- package/resources/exgentic/cawdex_agent/utils.py +1061 -1061
- package/resources/hal/cawdex_agent/README.md +24 -24
- package/resources/hal/cawdex_agent/__init__.py +1 -1
- package/resources/hal/cawdex_agent/main.py +550 -550
- package/resources/hal/cawdex_agent/requirements.txt +2 -2
- package/resources/kbench/cawdex_agent/README.md +107 -107
- package/resources/kbench/cawdex_agent/adapter.manifest.json +19 -19
- package/resources/kbench/cawdex_agent/runner.mjs +753 -753
- package/resources/open_agent_leaderboard/cawdex-agent-card.md +119 -119
- package/resources/terminal_bench/__init__.py +1 -1
- package/resources/terminal_bench/cawdex_agent.py +174 -174
- package/resources/terminal_bench/setup.sh +121 -121
|
@@ -1,550 +1,550 @@
|
|
|
1
|
-
"""HAL custom-agent adapter for Cawdex.
|
|
2
|
-
|
|
3
|
-
HAL expects a module-level run(input, **kwargs) function. This adapter keeps
|
|
4
|
-
Cawdex framework-agnostic by launching the installed CLI in headless
|
|
5
|
-
benchmark mode, then returning the artifact shape expected by common HAL tasks.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
from __future__ import annotations
|
|
9
|
-
|
|
10
|
-
import json
|
|
11
|
-
import os
|
|
12
|
-
import re
|
|
13
|
-
import shlex
|
|
14
|
-
import subprocess
|
|
15
|
-
from dataclasses import dataclass
|
|
16
|
-
from pathlib import Path
|
|
17
|
-
from typing import Any
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
# Ordered (compiled pattern, replacement) pairs applied by _redact. The
# "sk-or-v1-" rule is listed before the generic "sk-" rule so that the longer
# prefix is masked with its own label first.
SECRET_REPLACEMENTS = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"sk-or-v1-[A-Za-z0-9_-]+", "sk-or-v1-[REDACTED]"),
        (r"sk-[A-Za-z0-9_-]{16,}", "sk-[REDACTED]"),
        (r"hf_[A-Za-z0-9]{16,}", "hf_[REDACTED]"),
        (r"KGAT_[A-Za-z0-9]{16,}", "KGAT_[REDACTED]"),
        (r"npm_[A-Za-z0-9]{16,}", "npm_[REDACTED]"),
    )
]
|
|
27
|
-
|
|
28
|
-
# Matches task keys that contain an answer-like word as a whole
# underscore-delimited segment (for example "test_patch" or "gold_answer").
# _safe_task_view uses it to decide which fields to keep out of the prompt.
ORACLE_FIELD_RE = re.compile(
    r"(^|_)(patch|test_patch|solution|answer|gold|fail_to_pass|pass_to_pass)($|_)",
    flags=re.IGNORECASE,
)
|
|
32
|
-
|
|
33
|
-
# Keys that _safe_task_view places first, in this order, when they are present
# in a task; every other key follows alphabetically.
SAFE_FIELD_ORDER = [
    # Identification and repository fields (several are read by _is_patch_task).
    "instance_id",
    "task_id",
    "repo",
    "base_commit",
    "version",
    "created_at",
    "problem_statement",
    "hints_text",
    # Problem-description fields (several are read by _is_usaco_task).
    "description",
    "description_no_samples",
    "samples",
    "num_tests",
    "num_samples",
    "problem_link",
    "problem_level",
    "cp_id",
    "problem_id",
    "runtime_limit",
    "memory_limit",
    "runtime_limit_sentences",
    "memory_limit_sentences",
    # Dataset/instruction fields (several are read by _is_science_agent_task).
    "task_inst",
    "dataset_path",
    "dataset_folder_tree",
    "dataset_preview",
    "output_fname",
    "domain_knowledge",
]
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
@dataclass
class AgentRun:
    """Captured outcome of one headless Cawdex CLI invocation."""

    # Exit status of the CLI process; _run_cawdex records 124 for a timeout.
    returncode: int
    # Redacted standard output of the process.
    stdout: str
    # Redacted standard error of the process.
    stderr: str
    # Per-task directory holding the benchmark trace and saved stdout/stderr.
    trace_dir: Path
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
def _redact(text: Any) -> str:
    """Return *text* as a string with known secret-token shapes masked.

    Args:
        text: Value to sanitise. Bytes are decoded as UTF-8 (undecodable bytes
            are replaced); any other falsy value becomes the empty string;
            everything else is converted with ``str``.

    Returns:
        The text after applying every SECRET_REPLACEMENTS substitution in order.
    """
    if isinstance(text, (bytes, bytearray)):
        # subprocess.TimeoutExpired exposes captured output as bytes even when
        # the process was started in text mode; str() on bytes would yield the
        # "b'...'" repr instead of the actual output.
        value = bytes(text).decode("utf-8", errors="replace")
    else:
        value = str(text or "")
    for pattern, replacement in SECRET_REPLACEMENTS:
        value = pattern.sub(replacement, value)
    return value
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
def _truncate(text: str, limit: int = 50000) -> str:
    """Redact *text* and cap it at *limit* characters.

    When the redacted text is longer than *limit*, the first *limit*
    characters are kept and a trailing marker states how many were dropped.
    """
    redacted = _redact(text)
    overflow = len(redacted) - limit
    if overflow <= 0:
        return redacted
    return f"{redacted[:limit]}\n...[truncated {overflow} chars]"
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def _safe_task_id(task_id: Any) -> str:
|
|
88
|
-
raw = str(task_id or "task")
|
|
89
|
-
safe = re.sub(r"[^A-Za-z0-9_.-]+", "-", raw).strip("-")
|
|
90
|
-
return safe or "task"
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
def _include_oracle_fields() -> bool:
|
|
94
|
-
return os.environ.get("CAWDEX_HAL_INCLUDE_ORACLE_FIELDS", "").lower() in {"1", "true", "yes", "on"}
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
def _safe_task_view(task: dict[str, Any]) -> tuple[dict[str, Any], list[str]]:
    """Split *task* into prompt-safe fields and the names of withheld fields.

    When oracle fields are explicitly enabled the task is returned as-is with
    an empty omission list. Otherwise keys are visited in SAFE_FIELD_ORDER
    priority followed by the remaining keys alphabetically, and any key
    matching ORACLE_FIELD_RE is withheld.

    Returns:
        ``(visible, hidden)`` where *visible* maps the kept keys to their
        values in visiting order and *hidden* lists the withheld key names.
    """
    if _include_oracle_fields():
        return task, []

    priority = [name for name in SAFE_FIELD_ORDER if name in task]
    remainder = sorted(name for name in task if name not in priority)

    visible: dict[str, Any] = {}
    hidden: list[str] = []
    for name in priority + remainder:
        if ORACLE_FIELD_RE.search(name):
            hidden.append(name)
        else:
            visible[name] = task[name]
    return visible, hidden
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
def _is_patch_task(task: dict[str, Any]) -> bool:
|
|
114
|
-
return bool(
|
|
115
|
-
task.get("problem_statement")
|
|
116
|
-
and (task.get("repo") or task.get("base_commit") or task.get("instance_id"))
|
|
117
|
-
)
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
def _is_science_agent_task(task: dict[str, Any]) -> bool:
|
|
121
|
-
return bool(
|
|
122
|
-
task.get("task_inst")
|
|
123
|
-
and (task.get("dataset_path") or task.get("output_fname") or task.get("dataset_folder_tree"))
|
|
124
|
-
)
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
def _is_appworld_task(task: dict[str, Any]) -> bool:
|
|
128
|
-
keys = set(task.keys())
|
|
129
|
-
return bool(task.get("task_id") and keys.issubset({"task_id", "instance_id"}))
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
def _is_usaco_task(task: dict[str, Any]) -> bool:
|
|
133
|
-
return bool(
|
|
134
|
-
task.get("description")
|
|
135
|
-
and (task.get("samples") or task.get("cp_id") or task.get("problem_id") or task.get("problem_link"))
|
|
136
|
-
)
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
# Keyword rules consulted by _profile_for_task, in priority order. Each rule is
# (profile, any_of, all_of_groups): it matches when any substring in any_of
# occurs in the lowercased task JSON, or when every substring of at least one
# group in all_of_groups occurs. The first matching rule wins.
_PROFILE_RULES_BEFORE_PATCH = (
    ("appworld", ("appworld", "app-world"), ()),
    (
        "browsecomp",
        ("browsecomp", "browsecomp+", "browse-comp", "deep research", "web research"),
        (),
    ),
    (
        "tau2",
        ("tau2", "tau 2", "tau-bench", "tau_bench", "taubench", "customer support"),
        (),
    ),
    (
        "roadmapbench",
        (
            "roadmapbench",
            "roadmap-bench",
            "long-horizon",
            "long horizon",
            "version upgrade",
            "multi-target",
        ),
        (),
    ),
    (
        "swe-cycle",
        (
            "swe-cycle",
            "swecycle",
            "swe cycle",
            "swe-judge",
            "swejudge",
            "fullcycle",
            "codeimpl",
            "testgen",
            "run_script",
            "parsing_script",
            "selected_test_files_to_run",
            "environment_setup_commit",
            "before_repo_set_cmd",
            "bare repository",
        ),
        (),
    ),
    (
        "swe-ci",
        (
            "swe-ci",
            "sweci",
            "swe ci",
            "run_tests",
            "define_requirements",
            "modify_code",
            "test gap",
            "current_sha",
            "target_sha",
        ),
        (("codebase maintenance", "continuous integration"),),
    ),
    (
        "swe-prbench",
        (
            "swe-prbench",
            "swe prbench",
            "swe-pr",
            "prbench",
            "pull request review",
            "code review quality",
            "human_review_comments",
            "diff_patch",
            "type2_contextual",
        ),
        (),
    ),
    (
        "tml-bench",
        (
            "tml-bench",
            "tmlbench",
            "tabular ml",
            "kaggle-style",
            "kaggle style",
            "sample_submission",
            "private holdout",
        ),
        (("train.csv", "test.csv", "submission"),),
    ),
    (
        "pi-bench",
        (
            "pi-bench",
            "pibench",
            "proactive personal assistant",
            "proactive assistant",
            "hidden intent",
            "latent intent",
            "user profile",
            "message history",
            "current app",
            "proactivity score",
            "completion score",
        ),
        (),
    ),
    (
        "saasbench",
        (
            "saasbench",
            "saas-bench",
            "enterprise saas",
            "validation nodes",
            "tenant",
            "migration",
        ),
        (),
    ),
    (
        "swe-bench-mobile",
        (
            "swe-bench mobile",
            "swebench mobile",
            "xcode",
            "swift",
            "objective-c",
            "figma",
            "simulator",
        ),
        (),
    ),
    (
        "webdevbench",
        (
            "swe-webdevbench",
            "swe-webdev-bench",
            "webdevbench",
            "webdev-bench",
            "vibe coding",
            "virtual software agency",
            "canary requirement",
            "frontend-backend",
            "production readiness",
        ),
        (),
    ),
)

# Rules consulted only after the structural patch-task check has failed.
_PROFILE_RULES_AFTER_PATCH = (
    ("terminalworld", ("terminalworld", "terminal-world", "tw_", "asciinema"), ()),
    ("terminal-bench", ("terminal-bench", "terminalbench"), ()),
    # The original chain also tested "browsecomp" here, but that substring
    # always returns "browsecomp" from the earlier rule, so it could never
    # select "wildclaw" and is omitted.
    ("wildclaw", ("wildclaw", "openclaw", "ossworld", "bfcl", "webwalkerqa"), ()),
    ("arc-agi", ("arc-agi", "arc_agi", "arc prize", "arc-prize", "kaggle arc"), ()),
    (
        "specbench",
        (
            "specbench",
            "spec-bench",
            "spec compliance",
            "visible tests",
            "held-out",
            "holdout",
        ),
        (),
    ),
    (
        "reward-hacking",
        (
            "reward hacking benchmark",
            "reward-hacking",
            "reward_hacking",
            "rhb",
            "evaluator tamper",
        ),
        (),
    ),
)


def _first_keyword_profile(task_text: str, rules: Any) -> str | None:
    """Return the profile of the first rule in *rules* matching *task_text*."""
    for profile, any_of, all_of_groups in rules:
        if any(keyword in task_text for keyword in any_of):
            return profile
        if any(all(keyword in task_text for keyword in group) for group in all_of_groups):
            return profile
    return None


def _profile_for_task(task: dict[str, Any]) -> str:
    """Choose the benchmark profile name for *task*.

    The task is serialised to JSON and lowercased, then checked in this order:
    the structural AppWorld test, the keyword rules that precede the patch
    check, the structural patch-task test ("swe-bench"), and the remaining
    keyword rules. Keys and values both take part in keyword matching because
    the whole JSON text is searched.

    Returns:
        The first matching profile name, or "generic" when nothing matches.
    """
    task_text = json.dumps(task, ensure_ascii=False).lower()
    if _is_appworld_task(task):
        return "appworld"
    profile = _first_keyword_profile(task_text, _PROFILE_RULES_BEFORE_PATCH)
    if profile is not None:
        return profile
    if _is_patch_task(task):
        return "swe-bench"
    return _first_keyword_profile(task_text, _PROFILE_RULES_AFTER_PATCH) or "generic"
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
def _build_prompt(task_id: str, task: dict[str, Any]) -> str:
    """Compose the headless CLI prompt for one HAL task.

    The prompt starts with a ``/benchmark <profile>`` header and general
    instructions, adds one profile- or shape-specific guidance block, notes
    which oracle-like fields were withheld, and ends with the redacted,
    length-capped task JSON.

    Args:
        task_id: Identifier shown in the prompt header.
        task: Raw HAL task dictionary.

    Returns:
        The prompt text, lines joined with newlines.
    """
    profile = _profile_for_task(task)
    safe_task, omitted = _safe_task_view(task)
    # Keys are deliberately NOT re-sorted here: _safe_task_view already orders
    # them by SAFE_FIELD_ORDER, and alphabetical sorting would discard that
    # priority, letting _truncate cut fields such as problem_statement in
    # favour of alphabetically earlier ones.
    body = json.dumps(safe_task, ensure_ascii=False, indent=2)

    lines = [
        f"/benchmark {profile} HAL task {task_id}",
        "",
        "You are running inside the Holistic Agent Leaderboard harness.",
        "Use Cawdex benchmark discipline: inspect local files, patch only what is needed, run targeted verification, and preserve trace evidence.",
    ]
    if profile == "swe-bench":
        lines.extend([
            "This is a SWE-bench-style patch task. Modify the checked-out repository; the HAL adapter will collect the git patch after the run.",
            "Do not edit tests or harness files unless the task explicitly asks for that.",
        ])
    elif profile == "terminalworld":
        lines.append("This is a TerminalWorld-style terminal workflow. Treat instruction.md/task text as the contract, avoid solve.sh/reference material, produce required persistent artifacts, and verify files/services in the environment.")
    elif _is_science_agent_task(task):
        lines.append("This is a ScienceAgentBench-style task. Produce a concise solution trajectory and any required output/program artifact in the final response.")
    elif profile == "appworld" or _is_appworld_task(task):
        lines.append("This is an AppWorld-style environment task. Interact with the environment as needed, then complete the task through the environment API.")
    elif profile == "browsecomp":
        lines.append("This is a BrowseComp+-style research task. Use source-grounded browsing/retrieval evidence, cross-check claims, and return the answer with auditable attribution.")
    elif profile == "tau2":
        lines.append("This is a tau2/Tau-Bench-style policy workflow. Follow the domain policy, use only available action schemas, and verify tool observations before completing.")
    elif profile == "webdevbench":
        lines.append("This is a SWE-WebDevBench-style full-stack app-agency task. Preserve canary business requirements, verify frontend-backend coupling, and collect production/security evidence when feasible.")
    elif profile == "swe-cycle":
        lines.append("This is a SWE-Cycle/SWE-Judge-style issue-resolution lifecycle task. Track environment setup, code implementation, verification-test generation when required, and post-edit static/dynamic judge evidence.")
    elif profile == "swe-ci":
        lines.append("This is a SWE-CI-style repository evolution task. Track current/target commits, test gaps, requirement derivation, and CI-loop validation across run_tests -> define_requirements -> modify_code.")
    elif profile == "swe-prbench":
        lines.append("This is a SWE-PRBench-style pull request review task. Inspect PR metadata and diff first, expand context only for concrete suspected issues, and return severity-rated review findings with file/line evidence instead of patching unless explicitly requested.")
    elif profile == "tml-bench":
        lines.append("This is a TML-Bench/Kaggle-style tabular ML task. Build the data contract first, avoid hidden-label leakage, train an honest baseline, and produce a sample_submission-compatible artifact with validation evidence.")
    elif profile == "pi-bench":
        lines.append("This is a Pi-Bench-style proactive personal assistant task. Build the user/workspace/app context contract, infer hidden intents carefully, ask one focused clarification when needed, and verify observable state after proactive actions.")
    elif _is_usaco_task(task):
        lines.append("This is a USACO-style programming task. Produce the final code solution in the final response.")
    else:
        lines.append("Return the final task response clearly; the HAL adapter will store it in the task response field.")

    if omitted:
        lines.append("Oracle-like task fields omitted from the prompt by default: " + ", ".join(sorted(omitted)) + ".")

    lines.extend(["", "## HAL task data", _truncate(body)])
    return "\n".join(lines)
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
def _base_command() -> list[str]:
|
|
366
|
-
command = os.environ.get("CAWDEX_HAL_COMMAND") or os.environ.get("CAWDEX_HAL_COMMAND", "cawdex")
|
|
367
|
-
parts = shlex.split(command, posix=os.name != "nt")
|
|
368
|
-
return parts or ["cawdex"]
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
def _append_flag(args: list[str], flag: str, value: Any) -> None:
|
|
372
|
-
if value is None:
|
|
373
|
-
return
|
|
374
|
-
text = str(value).strip()
|
|
375
|
-
if not text:
|
|
376
|
-
return
|
|
377
|
-
args.extend([flag, text])
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
def _run_cawdex(task_id: str, prompt: str, kwargs: dict[str, Any]) -> AgentRun:
    """Run the Cawdex CLI once for *task_id* and capture its redacted output.

    A per-task trace directory is created under CAWDEX_HAL_TRACE_DIR (default
    ".cawdex/hal-trace"). The CLI is launched in the current working directory
    with ``--perm yolo`` and any model/provider/limit options found in
    *kwargs*. Redacted stdout and stderr are saved to "hal-stdout.txt" and
    "hal-stderr.txt" in the trace directory.

    Args:
        task_id: Task identifier, sanitised for use as a directory name.
        prompt: Prompt text passed via ``--prompt``.
        kwargs: HAL agent arguments; the keys read are "model_name"/"model",
            "provider", "max_turns", "max_tokens", "temperature" and
            "output_format".

    Returns:
        An AgentRun; ``returncode`` is 124 when the run exceeded
        CAWDEX_HAL_TIMEOUT_SEC (default 1800 seconds).
    """

    def _as_text(captured: Any) -> str:
        # TimeoutExpired carries whatever was captured as bytes even with
        # text=True (or None when nothing was read); decode it so the saved
        # logs contain the output itself rather than a "b'...'" repr.
        if isinstance(captured, (bytes, bytearray)):
            return bytes(captured).decode("utf-8", errors="replace")
        return captured or ""

    trace_root = Path(os.environ.get("CAWDEX_HAL_TRACE_DIR", ".cawdex/hal-trace"))
    trace_dir = trace_root / _safe_task_id(task_id)
    trace_dir.mkdir(parents=True, exist_ok=True)

    # setdefault keeps any value the caller already exported.
    env = os.environ.copy()
    env.setdefault("CAWDEX_ENV_CONFIG", "1")
    env.setdefault("CAWDEX_THEME", "minimal")
    env.setdefault("CAWDEX_SHOW_THINKING", "0")
    env.setdefault("CAWDEX_MEMORY", "0")
    env.setdefault("CAWDEX_BASH_TIMEOUT_MS", "300000")

    args = _base_command()
    args.extend([
        "--prompt",
        prompt,
        "--perm",
        "yolo",
        "--benchmark-trace-dir",
        str(trace_dir),
    ])
    _append_flag(args, "--model", kwargs.get("model_name") or kwargs.get("model"))
    _append_flag(args, "--provider", kwargs.get("provider"))
    _append_flag(args, "--max-turns", kwargs.get("max_turns"))
    _append_flag(args, "--max-tokens", kwargs.get("max_tokens"))
    _append_flag(args, "--temperature", kwargs.get("temperature"))
    _append_flag(args, "--output-format", kwargs.get("output_format"))

    timeout = int(os.environ.get("CAWDEX_HAL_TIMEOUT_SEC", "1800"))
    try:
        completed = subprocess.run(
            args,
            cwd=os.getcwd(),
            env=env,
            text=True,
            capture_output=True,
            timeout=timeout,
            check=False,
        )
        stdout = _redact(completed.stdout)
        stderr = _redact(completed.stderr)
        returncode = completed.returncode
    except subprocess.TimeoutExpired as exc:
        stdout = _redact(_as_text(exc.stdout))
        stderr = _redact(_as_text(exc.stderr)) + f"\nCawdex timed out after {timeout}s"
        returncode = 124

    (trace_dir / "hal-stdout.txt").write_text(stdout, encoding="utf-8")
    (trace_dir / "hal-stderr.txt").write_text(stderr, encoding="utf-8")
    return AgentRun(returncode=returncode, stdout=stdout, stderr=stderr, trace_dir=trace_dir)
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
def _run_git(args: list[str], cwd: Path | None = None) -> str:
    """Run ``git <args>`` and return its redacted stdout, best effort.

    Exit statuses 0 and 1 are both accepted. Any failure to run the command
    (including the 60-second timeout) or any other exit status yields "".

    Args:
        args: Arguments placed after ``git``.
        cwd: Directory to run in; the process working directory when omitted.
    """
    command = ["git", *args]
    workdir = None if not cwd else str(cwd)
    try:
        result = subprocess.run(
            command,
            cwd=workdir,
            text=True,
            capture_output=True,
            check=False,
            timeout=60,
        )
    except Exception:
        # Deliberately broad: patch collection must never abort the run.
        return ""
    if result.returncode in (0, 1):
        return _redact(result.stdout)
    return ""
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
def _latest_trace_patch(trace_dir: Path) -> str:
    """Return the newest non-blank "worktree.patch" found under *trace_dir*.

    Candidates are searched recursively and tried from most to least recently
    modified; unreadable files are skipped. The text is redacted before being
    returned, and "" is returned when no candidate has content.
    """
    candidates = list(trace_dir.rglob("worktree.patch"))
    candidates.sort(key=lambda candidate: candidate.stat().st_mtime, reverse=True)
    for candidate in candidates:
        try:
            content = _redact(candidate.read_text(encoding="utf-8", errors="replace"))
        except OSError:
            continue
        if content.strip():
            return content
    return ""
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
def _collect_git_patch(trace_dir: Path) -> str:
    """Return the patch produced by the run.

    A non-blank "worktree.patch" recorded in the trace wins. Otherwise the
    patch is rebuilt from git in the current working directory: tracked
    changes relative to HEAD, plus (on non-Windows hosts) one add-file diff
    per untracked, non-ignored file.

    Args:
        trace_dir: Trace directory searched for a recorded patch.

    Returns:
        The concatenated patch text, or "" when nothing was found.
    """
    trace_patch = _latest_trace_patch(trace_dir)
    if trace_patch:
        return trace_patch

    diff_flags = ["--binary", "--no-ext-diff"]
    # One HEAD -> worktree diff covers staged and unstaged edits together.
    # Concatenating the unstaged (index -> worktree) diff before the staged
    # (HEAD -> index) diff does not form an applicable patch when a file has
    # both kinds of change.
    tracked = _run_git(["diff", *diff_flags, "HEAD", "--"])
    if tracked:
        parts = [tracked]
    else:
        # Fallback for repositories where HEAD cannot be diffed (for example
        # no commits yet): keep the original pair of diffs.
        parts = [
            _run_git(["diff", *diff_flags]),
            _run_git(["diff", "--cached", *diff_flags]),
        ]
    if os.name != "nt":
        # Untracked files are rendered as additions by diffing against
        # /dev/null, which is why this step is skipped on Windows.
        raw_untracked = _run_git(["ls-files", "--others", "--exclude-standard", "-z"])
        for filename in raw_untracked.split("\0"):
            if filename:
                parts.append(_run_git(["diff", "--no-index", *diff_flags, "--", "/dev/null", filename]))
    return "".join(part for part in parts if part)
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
def _latest_summary(trace_dir: Path) -> dict[str, Any]:
|
|
483
|
-
summaries = sorted(
|
|
484
|
-
trace_dir.rglob("summary.json"),
|
|
485
|
-
key=lambda path: path.stat().st_mtime,
|
|
486
|
-
reverse=True,
|
|
487
|
-
)
|
|
488
|
-
for summary in summaries:
|
|
489
|
-
try:
|
|
490
|
-
return json.loads(summary.read_text(encoding="utf-8"))
|
|
491
|
-
except Exception:
|
|
492
|
-
continue
|
|
493
|
-
return {}
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
def _response_text(run_result: AgentRun) -> str:
    """Return the text to report as the agent's answer, capped at 20000 chars.

    The "finalAssistant" string from the latest trace summary is used when it
    is non-blank; otherwise the non-empty stdout and stderr are joined with a
    newline.
    """
    final_message = _latest_summary(run_result.trace_dir).get("finalAssistant")
    if isinstance(final_message, str) and final_message.strip():
        return _truncate(final_message, 20000)
    streams = [stream for stream in (run_result.stdout, run_result.stderr) if stream]
    return _truncate("\n".join(streams), 20000)
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
def _submission_for_task(task: dict[str, Any], run_result: AgentRun) -> Any:
    """Shape the non-patch submission for *task*.

    Returns:
        The response text for ScienceAgentBench-shaped tasks; "Completed" (or
        the response text after a non-zero exit) for AppWorld-shaped tasks;
        otherwise a copy of *task* with a "response" entry set.
    """
    response = _response_text(run_result)
    if _is_science_agent_task(task):
        return response
    if _is_appworld_task(task):
        if run_result.returncode == 0:
            return "Completed"
        return response
    return {**task, "response": response}
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
def run(input: dict[str, dict[str, Any]], **kwargs: Any) -> dict[str, Any]:
    """Run Cawdex for HAL.

    Each dictionary task is turned into a prompt and executed through the
    Cawdex CLI. Patch-style tasks map to the collected patch text, while other
    tasks map to whatever ``_submission_for_task`` produces. Entries whose
    value is not a dictionary are passed through unchanged.

    Args:
        input: Mapping of task id to task data, as supplied by HAL.
        **kwargs: Agent arguments forwarded to the CLI invocation.

    Returns:
        Mapping of stringified task id to that task's submission.

    Raises:
        TypeError: If *input* is not a dictionary.
        ValueError: If more than one patch-style task is supplied, since they
            would share one checked-out worktree.
    """
    if not isinstance(input, dict):
        raise TypeError("Cawdex HAL adapter expects input to be a dictionary")

    patch_ids = [
        str(raw_id)
        for raw_id, candidate in input.items()
        if isinstance(candidate, dict) and _is_patch_task(candidate)
    ]
    if len(patch_ids) > 1:
        raise ValueError("Cawdex HAL adapter expects one patch-style task per checked-out worktree")

    results: dict[str, Any] = {}
    for raw_id, task in input.items():
        key = str(raw_id)
        if not isinstance(task, dict):
            results[key] = task
            continue

        agent_run = _run_cawdex(key, _build_prompt(key, task), kwargs)
        if _is_patch_task(task):
            results[key] = _collect_git_patch(agent_run.trace_dir)
        else:
            results[key] = _submission_for_task(task, agent_run)

    return results
|
|
1
|
+
"""HAL custom-agent adapter for Cawdex.
|
|
2
|
+
|
|
3
|
+
HAL expects a module-level run(input, **kwargs) function. This adapter keeps
|
|
4
|
+
Cawdex framework-agnostic by launching the installed CLI in headless
|
|
5
|
+
benchmark mode, then returning the artifact shape expected by common HAL tasks.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
import re
|
|
13
|
+
import shlex
|
|
14
|
+
import subprocess
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Ordered (compiled pattern, replacement) pairs applied by _redact. The
# "sk-or-v1-" rule is listed before the generic "sk-" rule so that the longer
# prefix is masked with its own label first.
SECRET_REPLACEMENTS = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"sk-or-v1-[A-Za-z0-9_-]+", "sk-or-v1-[REDACTED]"),
        (r"sk-[A-Za-z0-9_-]{16,}", "sk-[REDACTED]"),
        (r"hf_[A-Za-z0-9]{16,}", "hf_[REDACTED]"),
        (r"KGAT_[A-Za-z0-9]{16,}", "KGAT_[REDACTED]"),
        (r"npm_[A-Za-z0-9]{16,}", "npm_[REDACTED]"),
    )
]
|
|
27
|
+
|
|
28
|
+
# Matches task keys that contain an answer-like word as a whole
# underscore-delimited segment (for example "test_patch" or "gold_answer").
# _safe_task_view uses it to decide which fields to keep out of the prompt.
ORACLE_FIELD_RE = re.compile(
    r"(^|_)(patch|test_patch|solution|answer|gold|fail_to_pass|pass_to_pass)($|_)",
    flags=re.IGNORECASE,
)
|
|
32
|
+
|
|
33
|
+
# Keys that _safe_task_view places first, in this order, when they are present
# in a task; every other key follows alphabetically.
SAFE_FIELD_ORDER = [
    # Identification and repository fields (several are read by _is_patch_task).
    "instance_id",
    "task_id",
    "repo",
    "base_commit",
    "version",
    "created_at",
    "problem_statement",
    "hints_text",
    # Problem-description fields (several are read by _is_usaco_task).
    "description",
    "description_no_samples",
    "samples",
    "num_tests",
    "num_samples",
    "problem_link",
    "problem_level",
    "cp_id",
    "problem_id",
    "runtime_limit",
    "memory_limit",
    "runtime_limit_sentences",
    "memory_limit_sentences",
    # Dataset/instruction fields (several are read by _is_science_agent_task).
    "task_inst",
    "dataset_path",
    "dataset_folder_tree",
    "dataset_preview",
    "output_fname",
    "domain_knowledge",
]
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@dataclass
class AgentRun:
    """Captured outcome of one headless Cawdex CLI invocation."""

    # Exit status of the CLI process.
    returncode: int
    # Standard output captured from the process.
    stdout: str
    # Standard error captured from the process.
    stderr: str
    # Directory holding the trace artifacts for this run.
    trace_dir: Path
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _redact(text: Any) -> str:
    """Return *text* as a string with known secret-token shapes masked.

    Args:
        text: Value to sanitise. Bytes are decoded as UTF-8 (undecodable bytes
            are replaced); any other falsy value becomes the empty string;
            everything else is converted with ``str``.

    Returns:
        The text after applying every SECRET_REPLACEMENTS substitution in order.
    """
    if isinstance(text, (bytes, bytearray)):
        # subprocess.TimeoutExpired exposes captured output as bytes even when
        # the process was started in text mode; str() on bytes would yield the
        # "b'...'" repr instead of the actual output.
        value = bytes(text).decode("utf-8", errors="replace")
    else:
        value = str(text or "")
    for pattern, replacement in SECRET_REPLACEMENTS:
        value = pattern.sub(replacement, value)
    return value
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _truncate(text: str, limit: int = 50000) -> str:
    """Redact *text* and cap it at *limit* characters.

    When the redacted text is longer than *limit*, the first *limit*
    characters are kept and a trailing marker states how many were dropped.
    """
    redacted = _redact(text)
    overflow = len(redacted) - limit
    if overflow <= 0:
        return redacted
    return f"{redacted[:limit]}\n...[truncated {overflow} chars]"
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _safe_task_id(task_id: Any) -> str:
|
|
88
|
+
raw = str(task_id or "task")
|
|
89
|
+
safe = re.sub(r"[^A-Za-z0-9_.-]+", "-", raw).strip("-")
|
|
90
|
+
return safe or "task"
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _include_oracle_fields() -> bool:
|
|
94
|
+
return os.environ.get("CAWDEX_HAL_INCLUDE_ORACLE_FIELDS", "").lower() in {"1", "true", "yes", "on"}
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _safe_task_view(task: dict[str, Any]) -> tuple[dict[str, Any], list[str]]:
    """Split *task* into prompt-safe fields and the names of withheld fields.

    When oracle fields are explicitly enabled the task is returned as-is with
    an empty omission list. Otherwise keys are visited in SAFE_FIELD_ORDER
    priority followed by the remaining keys alphabetically, and any key
    matching ORACLE_FIELD_RE is withheld.

    Returns:
        ``(visible, hidden)`` where *visible* maps the kept keys to their
        values in visiting order and *hidden* lists the withheld key names.
    """
    if _include_oracle_fields():
        return task, []

    priority = [name for name in SAFE_FIELD_ORDER if name in task]
    remainder = sorted(name for name in task if name not in priority)

    visible: dict[str, Any] = {}
    hidden: list[str] = []
    for name in priority + remainder:
        if ORACLE_FIELD_RE.search(name):
            hidden.append(name)
        else:
            visible[name] = task[name]
    return visible, hidden
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _is_patch_task(task: dict[str, Any]) -> bool:
|
|
114
|
+
return bool(
|
|
115
|
+
task.get("problem_statement")
|
|
116
|
+
and (task.get("repo") or task.get("base_commit") or task.get("instance_id"))
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _is_science_agent_task(task: dict[str, Any]) -> bool:
|
|
121
|
+
return bool(
|
|
122
|
+
task.get("task_inst")
|
|
123
|
+
and (task.get("dataset_path") or task.get("output_fname") or task.get("dataset_folder_tree"))
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _is_appworld_task(task: dict[str, Any]) -> bool:
|
|
128
|
+
keys = set(task.keys())
|
|
129
|
+
return bool(task.get("task_id") and keys.issubset({"task_id", "instance_id"}))
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _is_usaco_task(task: dict[str, Any]) -> bool:
    """Return True for tasks carrying the USACO-style problem fields.

    The task needs a truthy ``description`` plus at least one truthy value
    among ``samples``, ``cp_id``, ``problem_id`` and ``problem_link``.
    """
    if not task.get("description"):
        return False
    problem_markers = ("samples", "cp_id", "problem_id", "problem_link")
    return any(task.get(field) for field in problem_markers)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
# Ordered keyword rules consulted before the structural patch-task check.
# Each entry is (profile, markers); the first entry with a hit wins, so the
# order of entries is significant.  A marker is either a substring that must
# occur in the lower-cased JSON dump of the task, or a tuple of substrings
# that must all occur.
_PROFILE_RULES_BEFORE_PATCH = (
    # "browsecomp+" is not listed: any text containing it contains "browsecomp".
    ("browsecomp", ("browsecomp", "browse-comp", "deep research", "web research")),
    ("tau2", ("tau2", "tau 2", "tau-bench", "tau_bench", "taubench", "customer support")),
    (
        "roadmapbench",
        (
            "roadmapbench",
            "roadmap-bench",
            "long-horizon",
            "long horizon",
            "version upgrade",
            "multi-target",
        ),
    ),
    (
        "swe-cycle",
        (
            "swe-cycle",
            "swecycle",
            "swe cycle",
            "swe-judge",
            "swejudge",
            "fullcycle",
            "codeimpl",
            "testgen",
            "run_script",
            "parsing_script",
            "selected_test_files_to_run",
            "environment_setup_commit",
            "before_repo_set_cmd",
            "bare repository",
        ),
    ),
    (
        "swe-ci",
        (
            "swe-ci",
            "sweci",
            "swe ci",
            "run_tests",
            "define_requirements",
            "modify_code",
            "test gap",
            "current_sha",
            "target_sha",
            ("codebase maintenance", "continuous integration"),
        ),
    ),
    (
        "swe-prbench",
        (
            "swe-prbench",
            "swe prbench",
            "swe-pr",
            "prbench",
            "pull request review",
            "code review quality",
            "human_review_comments",
            "diff_patch",
            "type2_contextual",
        ),
    ),
    (
        "tml-bench",
        (
            "tml-bench",
            "tmlbench",
            "tabular ml",
            "kaggle-style",
            "kaggle style",
            "sample_submission",
            "private holdout",
            ("train.csv", "test.csv", "submission"),
        ),
    ),
    (
        "pi-bench",
        (
            "pi-bench",
            "pibench",
            "proactive personal assistant",
            "proactive assistant",
            "hidden intent",
            "latent intent",
            "user profile",
            "message history",
            "current app",
            "proactivity score",
            "completion score",
        ),
    ),
    (
        "saasbench",
        (
            "saasbench",
            "saas-bench",
            "enterprise saas",
            "validation nodes",
            "tenant",
            "migration",
        ),
    ),
    (
        "swe-bench-mobile",
        (
            "swe-bench mobile",
            "swebench mobile",
            "xcode",
            "swift",
            "objective-c",
            "figma",
            "simulator",
        ),
    ),
    (
        "webdevbench",
        (
            "swe-webdevbench",
            "swe-webdev-bench",
            "webdevbench",
            "webdev-bench",
            "vibe coding",
            "virtual software agency",
            "canary requirement",
            "frontend-backend",
            "production readiness",
        ),
    ),
)

# Ordered keyword rules consulted only when the task is not a patch task.
_PROFILE_RULES_AFTER_PATCH = (
    ("terminalworld", ("terminalworld", "terminal-world", "tw_", "asciinema")),
    ("terminal-bench", ("terminal-bench", "terminalbench")),
    # The original chain also tested "browsecomp" here; that test could never
    # fire because the "browsecomp" profile above returns first.
    ("wildclaw", ("wildclaw", "openclaw", "ossworld", "bfcl", "webwalkerqa")),
    ("arc-agi", ("arc-agi", "arc_agi", "arc prize", "arc-prize", "kaggle arc")),
    (
        "specbench",
        (
            "specbench",
            "spec-bench",
            "spec compliance",
            "visible tests",
            "held-out",
            "holdout",
        ),
    ),
    (
        "reward-hacking",
        (
            "reward hacking benchmark",
            "reward-hacking",
            "reward_hacking",
            "rhb",
            "evaluator tamper",
        ),
    ),
)


def _profile_marker_present(marker, text: str) -> bool:
    """Return True when a rule marker (substring or all-of tuple) occurs in text."""
    if isinstance(marker, str):
        return marker in text
    return all(part in text for part in marker)


def _match_profile_rules(rules, text: str):
    """Return the profile of the first rule with a matching marker, else None."""
    for profile, markers in rules:
        if any(_profile_marker_present(marker, text) for marker in markers):
            return profile
    return None


def _profile_for_task(task: dict[str, Any]) -> str:
    """Choose the benchmark profile name for a task.

    The task is serialized with ``json.dumps`` and lower-cased, so both key
    names and values take part in the substring matching.  Precedence is:

    1. ``"appworld"``: an identifier-only task, or the text mentions
       ``appworld`` / ``app-world``.
    2. The first matching entry of ``_PROFILE_RULES_BEFORE_PATCH``.
    3. ``"swe-bench"`` when ``_is_patch_task`` accepts the task.
    4. The first matching entry of ``_PROFILE_RULES_AFTER_PATCH``.
    5. ``"generic"``.

    Raises whatever ``json.dumps`` raises for a task it cannot serialize.
    """
    task_text = json.dumps(task, ensure_ascii=False).lower()
    if _is_appworld_task(task) or "appworld" in task_text or "app-world" in task_text:
        return "appworld"

    profile = _match_profile_rules(_PROFILE_RULES_BEFORE_PATCH, task_text)
    if profile is not None:
        return profile

    if _is_patch_task(task):
        return "swe-bench"

    return _match_profile_rules(_PROFILE_RULES_AFTER_PATCH, task_text) or "generic"
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
# One guidance sentence per profile that is selected purely by profile name.
# Profiles absent from this table fall through to the USACO/generic handling
# in _build_prompt.
_PROFILE_GUIDANCE = {
    "appworld": "This is an AppWorld-style environment task. Interact with the environment as needed, then complete the task through the environment API.",
    "browsecomp": "This is a BrowseComp+-style research task. Use source-grounded browsing/retrieval evidence, cross-check claims, and return the answer with auditable attribution.",
    "tau2": "This is a tau2/Tau-Bench-style policy workflow. Follow the domain policy, use only available action schemas, and verify tool observations before completing.",
    "webdevbench": "This is a SWE-WebDevBench-style full-stack app-agency task. Preserve canary business requirements, verify frontend-backend coupling, and collect production/security evidence when feasible.",
    "swe-cycle": "This is a SWE-Cycle/SWE-Judge-style issue-resolution lifecycle task. Track environment setup, code implementation, verification-test generation when required, and post-edit static/dynamic judge evidence.",
    "swe-ci": "This is a SWE-CI-style repository evolution task. Track current/target commits, test gaps, requirement derivation, and CI-loop validation across run_tests -> define_requirements -> modify_code.",
    "swe-prbench": "This is a SWE-PRBench-style pull request review task. Inspect PR metadata and diff first, expand context only for concrete suspected issues, and return severity-rated review findings with file/line evidence instead of patching unless explicitly requested.",
    "tml-bench": "This is a TML-Bench/Kaggle-style tabular ML task. Build the data contract first, avoid hidden-label leakage, train an honest baseline, and produce a sample_submission-compatible artifact with validation evidence.",
    "pi-bench": "This is a Pi-Bench-style proactive personal assistant task. Build the user/workspace/app context contract, infer hidden intents carefully, ask one focused clarification when needed, and verify observable state after proactive actions.",
}


def _build_prompt(task_id: str, task: dict[str, Any]) -> str:
    """Assemble the prompt text handed to the Cawdex CLI for one task.

    The prompt consists of a ``/benchmark <profile>`` header, two fixed
    harness sentences, profile-specific guidance, an optional note naming the
    fields withheld by ``_safe_task_view``, and the task data as indented
    JSON passed through ``_truncate``.

    Args:
        task_id: Identifier echoed in the header line.
        task: Raw task mapping.

    Returns:
        The prompt as a single newline-joined string.
    """
    profile = _profile_for_task(task)
    safe_task, omitted = _safe_task_view(task)
    # Deliberately no sort_keys: _safe_task_view emits SAFE_FIELD_ORDER keys
    # first, and _truncate cuts from the end, so keeping insertion order is
    # what lets the priority fields survive truncation.
    body = json.dumps(safe_task, ensure_ascii=False, indent=2)

    lines = [
        f"/benchmark {profile} HAL task {task_id}",
        "",
        "You are running inside the Holistic Agent Leaderboard harness.",
        "Use Cawdex benchmark discipline: inspect local files, patch only what is needed, run targeted verification, and preserve trace evidence.",
    ]

    if profile == "swe-bench":
        lines.extend([
            "This is a SWE-bench-style patch task. Modify the checked-out repository; the HAL adapter will collect the git patch after the run.",
            "Do not edit tests or harness files unless the task explicitly asks for that.",
        ])
    elif profile == "terminalworld":
        lines.append("This is a TerminalWorld-style terminal workflow. Treat instruction.md/task text as the contract, avoid solve.sh/reference material, produce required persistent artifacts, and verify files/services in the environment.")
    elif _is_science_agent_task(task):
        # Structural check: outranks every table-driven profile below.
        lines.append("This is a ScienceAgentBench-style task. Produce a concise solution trajectory and any required output/program artifact in the final response.")
    elif profile in _PROFILE_GUIDANCE:
        # Identifier-only (AppWorld-shaped) tasks always carry the "appworld"
        # profile, so no separate _is_appworld_task check is needed here.
        lines.append(_PROFILE_GUIDANCE[profile])
    elif _is_usaco_task(task):
        lines.append("This is a USACO-style programming task. Produce the final code solution in the final response.")
    else:
        lines.append("Return the final task response clearly; the HAL adapter will store it in the task response field.")

    if omitted:
        lines.append("Oracle-like task fields omitted from the prompt by default: " + ", ".join(sorted(omitted)) + ".")

    lines.extend(["", "## HAL task data", _truncate(body)])
    return "\n".join(lines)
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def _base_command() -> list[str]:
    """Return the argv prefix used to launch Cawdex.

    ``CAWDEX_HAL_COMMAND`` may hold a full command line (for example an
    interpreter followed by a script path); it is split shell-style, using
    POSIX rules everywhere except on Windows.  An unset, empty or
    whitespace-only value falls back to ``["cawdex"]``.
    """
    # The original looked the same variable up twice in an ``or`` expression;
    # one lookup with a literal fallback gives the same result.
    command = os.environ.get("CAWDEX_HAL_COMMAND") or "cawdex"
    parts = shlex.split(command, posix=os.name != "nt")
    return parts or ["cawdex"]
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
def _append_flag(args: list[str], flag: str, value: Any) -> None:
    """Append ``flag`` and its value to ``args`` in place, if there is a value.

    ``None`` and values whose string form is blank after stripping are
    skipped.  Anything else is appended as the stripped string form, so a
    numeric ``0`` is still emitted as ``"0"``.
    """
    rendered = "" if value is None else str(value).strip()
    if rendered:
        args.extend((flag, rendered))
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def _captured_text(data: Any) -> str:
    """Coerce captured subprocess output to ``str``.

    ``subprocess.TimeoutExpired`` exposes its output as ``bytes`` (or ``None``
    when nothing was captured) even when the process was started in text
    mode, so both cases are normalised here.
    """
    if data is None:
        return ""
    if isinstance(data, bytes):
        return data.decode("utf-8", errors="replace")
    return str(data)


def _run_cawdex(task_id: str, prompt: str, kwargs: dict[str, Any]) -> AgentRun:
    """Run the Cawdex CLI once for a task and capture its output.

    A per-task trace directory is created under ``CAWDEX_HAL_TRACE_DIR``
    (default ``.cawdex/hal-trace``).  The child inherits the current
    environment, with a handful of ``CAWDEX_*`` defaults filled in only where
    the caller has not set them.  Redacted stdout and stderr are written to
    ``hal-stdout.txt`` / ``hal-stderr.txt`` in the trace directory.

    Args:
        task_id: Identifier used (after sanitising) as the trace sub-directory.
        prompt: Text passed via ``--prompt``.
        kwargs: Optional CLI settings; ``model_name``/``model``, ``provider``,
            ``max_turns``, ``max_tokens``, ``temperature`` and
            ``output_format`` are forwarded as flags when present.

    Returns:
        An ``AgentRun`` holding the return code, the redacted streams and the
        trace directory.  A timeout (``CAWDEX_HAL_TIMEOUT_SEC``, default
        1800 seconds) is reported as return code 124 with a note appended to
        stderr.

    Raises:
        ValueError: If ``CAWDEX_HAL_TIMEOUT_SEC`` is not an integer.
        OSError: If the command cannot be started.
    """
    trace_root = Path(os.environ.get("CAWDEX_HAL_TRACE_DIR", ".cawdex/hal-trace"))
    trace_dir = trace_root / _safe_task_id(task_id)
    trace_dir.mkdir(parents=True, exist_ok=True)

    env = os.environ.copy()
    env.setdefault("CAWDEX_ENV_CONFIG", "1")
    env.setdefault("CAWDEX_THEME", "minimal")
    env.setdefault("CAWDEX_SHOW_THINKING", "0")
    env.setdefault("CAWDEX_MEMORY", "0")
    env.setdefault("CAWDEX_BASH_TIMEOUT_MS", "300000")

    args = _base_command()
    args.extend([
        "--prompt",
        prompt,
        "--perm",
        "yolo",
        "--benchmark-trace-dir",
        str(trace_dir),
    ])
    _append_flag(args, "--model", kwargs.get("model_name") or kwargs.get("model"))
    _append_flag(args, "--provider", kwargs.get("provider"))
    _append_flag(args, "--max-turns", kwargs.get("max_turns"))
    _append_flag(args, "--max-tokens", kwargs.get("max_tokens"))
    _append_flag(args, "--temperature", kwargs.get("temperature"))
    _append_flag(args, "--output-format", kwargs.get("output_format"))

    timeout = int(os.environ.get("CAWDEX_HAL_TIMEOUT_SEC", "1800"))
    try:
        completed = subprocess.run(
            args,
            cwd=os.getcwd(),
            env=env,
            text=True,
            capture_output=True,
            timeout=timeout,
            check=False,
        )
        stdout = _redact(completed.stdout)
        stderr = _redact(completed.stderr)
        returncode = completed.returncode
    except subprocess.TimeoutExpired as exc:
        # exc.stdout / exc.stderr may be None or bytes here; normalise first
        # so redaction, the concatenation below and write_text all get str.
        stdout = _redact(_captured_text(exc.stdout))
        stderr = _redact(_captured_text(exc.stderr)) + f"\nCawdex timed out after {timeout}s"
        returncode = 124

    (trace_dir / "hal-stdout.txt").write_text(stdout, encoding="utf-8")
    (trace_dir / "hal-stderr.txt").write_text(stderr, encoding="utf-8")
    return AgentRun(returncode=returncode, stdout=stdout, stderr=stderr, trace_dir=trace_dir)
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def _run_git(args: list[str], cwd: Path | None = None) -> str:
    """Run ``git`` with ``args`` and return its redacted stdout, best effort.

    Failures never propagate: if git cannot be started, times out after 60
    seconds, or exits with a status other than 0 or 1, the result is ``""``.
    Status 1 is accepted because ``git diff --no-index`` exits with 1 when
    its inputs differ.

    Output is decoded as UTF-8 with replacement characters, so diff content
    that is not valid UTF-8 no longer raises during decoding and silently
    discards the whole result.  This matches how ``_latest_trace_patch``
    reads patch files.

    Args:
        args: Arguments placed after ``git``.
        cwd: Working directory for the command; the process's own when None.
    """
    try:
        completed = subprocess.run(
            ["git", *args],
            cwd=str(cwd) if cwd else None,
            text=True,
            encoding="utf-8",
            errors="replace",
            capture_output=True,
            check=False,
            timeout=60,
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # Deliberate best effort: a missing git binary, a timeout or invalid
        # arguments all mean "no output available".
        return ""
    if completed.returncode not in {0, 1}:
        return ""
    return _redact(completed.stdout)
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
def _latest_trace_patch(trace_dir: Path) -> str:
    """Return the newest non-blank ``worktree.patch`` found under ``trace_dir``.

    Candidates are searched recursively and tried from most to least recently
    modified.  Each is read as UTF-8 with replacement characters and passed
    through ``_redact``; unreadable files and files that are blank after
    redaction are skipped.  Returns ``""`` when nothing usable is found.
    """
    candidates = list(trace_dir.rglob("worktree.patch"))
    candidates.sort(key=lambda candidate: candidate.stat().st_mtime, reverse=True)
    for candidate in candidates:
        try:
            content = _redact(candidate.read_text(encoding="utf-8", errors="replace"))
        except OSError:
            continue
        if content.strip():
            return content
    return ""
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
def _path_is_within(path: Path, root: Path) -> bool:
    """Return True when ``path`` equals ``root`` or lies somewhere below it."""
    try:
        path.relative_to(root)
    except ValueError:
        return False
    return True


def _collect_git_patch(trace_dir: Path) -> str:
    """Return the patch to submit for a patch-style task.

    A ``worktree.patch`` recorded under ``trace_dir`` takes precedence.
    Otherwise the patch is rebuilt from the current working directory:
    unstaged changes, staged changes and, except on Windows where
    ``/dev/null`` is unavailable, one ``git diff --no-index`` per untracked
    file.

    Untracked files located inside ``trace_dir`` are skipped.  The trace
    directory defaults to a path under the working tree, so without this
    filter the adapter's own logs (for example ``hal-stdout.txt``) would be
    included in the submitted patch.

    Returns:
        The concatenated diff text, or ``""`` when there is nothing to report.
    """
    trace_patch = _latest_trace_patch(trace_dir)
    if trace_patch:
        return trace_patch

    parts = [
        _run_git(["diff", "--binary", "--no-ext-diff"]),
        _run_git(["diff", "--cached", "--binary", "--no-ext-diff"]),
    ]
    if os.name != "nt":
        try:
            trace_root = trace_dir.resolve()
        except (OSError, RuntimeError):
            trace_root = None
        raw_untracked = _run_git(["ls-files", "--others", "--exclude-standard", "-z"])
        for filename in raw_untracked.split("\0"):
            if not filename:
                continue
            if trace_root is not None:
                try:
                    # ls-files paths are relative to the working directory,
                    # as is Path.resolve() for a relative path.
                    inside_trace = _path_is_within(Path(filename).resolve(), trace_root)
                except (OSError, RuntimeError):
                    inside_trace = False
                if inside_trace:
                    continue
            parts.append(_run_git(["diff", "--no-index", "--binary", "--no-ext-diff", "--", "/dev/null", filename]))
    return "".join(part for part in parts if part)
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
def _latest_summary(trace_dir: Path) -> dict[str, Any]:
    """Return the newest usable ``summary.json`` under ``trace_dir`` as a dict.

    Candidates are searched recursively and tried from most to least recently
    modified.  A file is skipped when it cannot be read, is not valid JSON,
    or decodes to something other than a JSON object.  The last case matters
    because callers use ``.get`` on the result.  Returns ``{}`` when no
    candidate qualifies.
    """
    summaries = sorted(
        trace_dir.rglob("summary.json"),
        key=lambda path: path.stat().st_mtime,
        reverse=True,
    )
    for summary in summaries:
        try:
            data = json.loads(summary.read_text(encoding="utf-8"))
        except (OSError, ValueError, RecursionError):
            # ValueError covers both JSON and UTF-8 decoding errors.
            continue
        if isinstance(data, dict):
            return data
    return {}
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def _response_text(run_result: AgentRun) -> str:
    """Pick the text to report as the agent's answer for a run.

    A non-blank string under ``finalAssistant`` in the latest trace summary
    is preferred.  Otherwise the non-empty captured streams are used, stdout
    first, joined by a newline.  Either way the text goes through
    ``_truncate`` with a 20000 limit.
    """
    final_message = _latest_summary(run_result.trace_dir).get("finalAssistant")
    if isinstance(final_message, str) and final_message.strip():
        return _truncate(final_message, 20000)

    streams = [stream for stream in (run_result.stdout, run_result.stderr) if stream]
    return _truncate("\n".join(streams), 20000)
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
def _submission_for_task(task: dict[str, Any], run_result: AgentRun) -> Any:
    """Shape the HAL submission for a task that is not patch-style.

    * ScienceAgentBench-shaped tasks yield the response text.
    * Identifier-only (AppWorld-shaped) tasks yield ``"Completed"`` when the
      run exited with 0, otherwise the response text.
    * Everything else yields a shallow copy of the task with the response
      text stored under ``"response"``.
    """
    response = _response_text(run_result)

    if _is_science_agent_task(task):
        return response

    if _is_appworld_task(task):
        if run_result.returncode == 0:
            return "Completed"
        return response

    return {**task, "response": response}
|
|
515
|
+
|
|
516
|
+
|
|
517
|
+
def run(input: dict[str, dict[str, Any]], **kwargs: Any) -> dict[str, Any]:
    """HAL entry point: run Cawdex once per task and collect the submissions.

    Results are keyed by ``str(task_id)``.  A patch-style task maps to the
    collected git patch, a ScienceAgentBench-style task to a trajectory
    string, an AppWorld-style task to "Completed" after a successful run, and
    any other task to a copy of itself with a ``response`` field (HAL's
    USACO-style pattern).  Entries whose value is not a dict are passed
    through unchanged without running anything.

    Raises:
        TypeError: If ``input`` is not a dictionary.
        ValueError: If more than one patch-style task is supplied, since all
            of them would share one checked-out worktree.
    """
    if not isinstance(input, dict):
        raise TypeError("Cawdex HAL adapter expects input to be a dictionary")

    patch_task_count = sum(
        1
        for candidate in input.values()
        if isinstance(candidate, dict) and _is_patch_task(candidate)
    )
    if patch_task_count > 1:
        raise ValueError("Cawdex HAL adapter expects one patch-style task per checked-out worktree")

    results: dict[str, Any] = {}
    for raw_id, task in input.items():
        key = str(raw_id)
        if not isinstance(task, dict):
            results[key] = task
            continue

        run_result = _run_cawdex(key, _build_prompt(key, task), kwargs)
        if _is_patch_task(task):
            results[key] = _collect_git_patch(run_result.trace_dir)
        else:
            results[key] = _submission_for_task(task, run_result)

    return results
|