cawdex 1.35.74 → 1.35.76

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. package/README.md +5 -5
  2. package/bin/anycode.js +2 -2
  3. package/bin/cawdex.js +408 -408
  4. package/bin/ecc-hooks.cjs +11 -11
  5. package/dist/agents-md.d.ts +31 -0
  6. package/dist/agents-md.js +340 -0
  7. package/dist/agents-md.js.map +1 -0
  8. package/dist/agents.js +1424 -1424
  9. package/dist/api.d.ts +1 -0
  10. package/dist/api.js +19 -14
  11. package/dist/api.js.map +1 -1
  12. package/dist/autonomous-loops.js +287 -287
  13. package/dist/benchmark-repos.d.ts +31 -0
  14. package/dist/benchmark-repos.js +234 -8
  15. package/dist/benchmark-repos.js.map +1 -1
  16. package/dist/command-palette.js +4 -2
  17. package/dist/command-palette.js.map +1 -1
  18. package/dist/compaction.js +8 -8
  19. package/dist/config.js +51 -36
  20. package/dist/config.js.map +1 -1
  21. package/dist/content-engine.js +543 -543
  22. package/dist/context-brief.d.ts +4 -0
  23. package/dist/context-brief.js +230 -0
  24. package/dist/context-brief.js.map +1 -0
  25. package/dist/cost-tracker.d.ts +33 -14
  26. package/dist/cost-tracker.js +81 -19
  27. package/dist/cost-tracker.js.map +1 -1
  28. package/dist/coverage.js +39 -39
  29. package/dist/docs-sync.js +98 -98
  30. package/dist/evaluation.js +452 -452
  31. package/dist/fixed-footer.d.ts +7 -1
  32. package/dist/fixed-footer.js +92 -18
  33. package/dist/fixed-footer.js.map +1 -1
  34. package/dist/git-workflow.js +49 -49
  35. package/dist/index.d.ts +2 -0
  36. package/dist/index.js +197 -65
  37. package/dist/index.js.map +1 -1
  38. package/dist/instant-artifact.d.ts +6 -0
  39. package/dist/instant-artifact.js +397 -0
  40. package/dist/instant-artifact.js.map +1 -0
  41. package/dist/live-queue.js +1 -1
  42. package/dist/live-queue.js.map +1 -1
  43. package/dist/model-aliases.d.ts +37 -0
  44. package/dist/model-aliases.js +203 -0
  45. package/dist/model-aliases.js.map +1 -0
  46. package/dist/orchestration.js +15 -15
  47. package/dist/permissions.d.ts +6 -0
  48. package/dist/permissions.js +53 -0
  49. package/dist/permissions.js.map +1 -1
  50. package/dist/pm2-manager.js +26 -26
  51. package/dist/query.d.ts +0 -1
  52. package/dist/query.js +74 -39
  53. package/dist/query.js.map +1 -1
  54. package/dist/refactor.js +87 -87
  55. package/dist/repo-command.js +7 -1
  56. package/dist/repo-command.js.map +1 -1
  57. package/dist/search-first.js +92 -92
  58. package/dist/skill-create.js +100 -100
  59. package/dist/stitch.js +1 -1
  60. package/dist/system-prompt.d.ts +2 -1
  61. package/dist/system-prompt.js +10 -5
  62. package/dist/system-prompt.js.map +1 -1
  63. package/dist/tools/github-repo-digest.d.ts +1 -1
  64. package/dist/tools/github-repo-digest.js +38 -6
  65. package/dist/tools/github-repo-digest.js.map +1 -1
  66. package/dist/types.d.ts +3 -0
  67. package/dist/types.js.map +1 -1
  68. package/dist/verification.js +55 -55
  69. package/package.json +1 -1
  70. package/resources/__init__.py +1 -1
  71. package/resources/exgentic/cawdex_agent/README.md +114 -114
  72. package/resources/exgentic/cawdex_agent/__init__.py +5 -5
  73. package/resources/exgentic/cawdex_agent/agent.py +605 -605
  74. package/resources/exgentic/cawdex_agent/requirements.txt +2 -2
  75. package/resources/exgentic/cawdex_agent/setup.sh +21 -21
  76. package/resources/exgentic/cawdex_agent/utils.py +1061 -1061
  77. package/resources/hal/cawdex_agent/README.md +24 -24
  78. package/resources/hal/cawdex_agent/__init__.py +1 -1
  79. package/resources/hal/cawdex_agent/main.py +550 -550
  80. package/resources/hal/cawdex_agent/requirements.txt +2 -2
  81. package/resources/kbench/cawdex_agent/README.md +107 -107
  82. package/resources/kbench/cawdex_agent/adapter.manifest.json +19 -19
  83. package/resources/kbench/cawdex_agent/runner.mjs +753 -753
  84. package/resources/open_agent_leaderboard/cawdex-agent-card.md +119 -119
  85. package/resources/terminal_bench/__init__.py +1 -1
  86. package/resources/terminal_bench/cawdex_agent.py +174 -174
  87. package/resources/terminal_bench/setup.sh +121 -121
@@ -1,605 +1,605 @@
1
- """Exgentic/Open Agent Leaderboard adapter for Cawdex."""
2
-
3
- from __future__ import annotations
4
-
5
- import json
6
- import os
7
- import shlex
8
- import subprocess
9
- from pathlib import Path
10
- from typing import Any, ClassVar
11
-
12
- from pydantic import Field
13
-
14
- from exgentic.core.agent import Agent
15
- from exgentic.core.agent_instance import AgentInstance
16
- from exgentic.core.types import Action, ActionType, Observation
17
- from exgentic.utils.cost import UpdatableCostReport
18
-
19
- from .utils import (
20
- ActionPayload,
21
- extract_action_payload,
22
- fallback_exgentic_action_payload,
23
- fold_exgentic_history,
24
- json_dumps,
25
- repair_exgentic_action_payload,
26
- redact,
27
- safe_id,
28
- shortlist_exgentic_actions,
29
- truncate,
30
- )
31
-
32
-
33
- class CawdexAgent(Agent):
34
- """Host-side Exgentic config for Cawdex."""
35
-
36
- display_name: ClassVar[str] = "Cawdex"
37
- slug_name: ClassVar[str] = "cawdex_agent"
38
-
39
- model: str = "openrouter/free"
40
- provider: str | None = None
41
- command: str = Field(default_factory=lambda: os.environ.get("CAWDEX_EXGENTIC_COMMAND") or os.environ.get("CAWDEX_EXGENTIC_COMMAND", "cawdex"))
42
- permission: str = "yolo"
43
- max_steps: int = 50
44
- max_turns: int | None = None
45
- max_tokens: int | None = None
46
- context_window_tokens: int | None = None
47
- temperature: float | None = None
48
- output_format: str = "text"
49
- timeout_sec: int = 1800
50
- memory: bool = False
51
- workdir: str | None = None
52
- cawdex_home: str | None = None
53
- extra_args: list[str] = Field(default_factory=list)
54
- extra_env: dict[str, str] = Field(default_factory=dict)
55
-
56
- @classmethod
57
- def _get_instance_class(cls):
58
- return CawdexAgentInstance
59
-
60
- @classmethod
61
- def _get_instance_class_ref(cls) -> str:
62
- return f"{cls.__module__}:CawdexAgentInstance"
63
-
64
- @property
65
- def model_name(self) -> str: # type: ignore[override]
66
- return self.model
67
-
68
- def get_models_names(self) -> list[str]: # type: ignore[override]
69
- return [self.model]
70
-
71
- def _get_instance_kwargs(self, session_id: str) -> dict[str, Any]:
72
- return {
73
- "session_id": session_id,
74
- "model": self.model,
75
- "provider": self.provider,
76
- "command": self.command,
77
- "permission": self.permission,
78
- "max_steps": self.max_steps,
79
- "max_turns": self.max_turns,
80
- "max_tokens": self.max_tokens,
81
- "context_window_tokens": self.context_window_tokens,
82
- "temperature": self.temperature,
83
- "output_format": self.output_format,
84
- "timeout_sec": self.timeout_sec,
85
- "memory": self.memory,
86
- "workdir": self.workdir,
87
- "cawdex_home": self.cawdex_home,
88
- "extra_args": self.extra_args,
89
- "extra_env": self.extra_env,
90
- }
91
-
92
-
93
- class CawdexAgentInstance(AgentInstance):
94
- """Per-session Exgentic runtime that asks Cawdex for the next action."""
95
-
96
- def __init__(
97
- self,
98
- session_id: str,
99
- model: str = "openrouter/free",
100
- provider: str | None = None,
101
- command: str = "cawdex",
102
- permission: str = "yolo",
103
- max_steps: int = 50,
104
- max_turns: int | None = None,
105
- max_tokens: int | None = None,
106
- context_window_tokens: int | None = None,
107
- temperature: float | None = None,
108
- output_format: str = "text",
109
- timeout_sec: int = 1800,
110
- memory: bool = False,
111
- workdir: str | None = None,
112
- cawdex_home: str | None = None,
113
- extra_args: list[str] | None = None,
114
- extra_env: dict[str, str] | None = None,
115
- ) -> None:
116
- super().__init__(session_id=session_id)
117
- self.model = model
118
- self.provider = provider
119
- self.command = command
120
- self.permission = permission
121
- self.max_steps = max_steps
122
- self.max_turns = max_turns
123
- self.max_tokens = max_tokens
124
- self.context_window_tokens = context_window_tokens
125
- self.temperature = temperature
126
- self.output_format = output_format
127
- self.timeout_sec = timeout_sec
128
- self.memory = memory
129
- self.workdir = workdir
130
- self.cawdex_home = cawdex_home
131
- self.extra_args = list(extra_args or [])
132
- self.extra_env = dict(extra_env or {})
133
- self._step = 0
134
- self._history: list[dict[str, Any]] = []
135
- self._cost_usd = 0.0
136
-
137
- def react(self, observation: Observation | None) -> Action | None:
138
- if self._step >= int(self.max_steps or 0):
139
- return None
140
-
141
- if observation is not None and not _observation_is_empty(observation):
142
- self._history.append({"role": "observation", "content": _observation_to_data(observation)})
143
-
144
- self._step += 1
145
- prompt = self._build_prompt()
146
- run = self._run_cawdex(prompt)
147
- self._history.append(
148
- {
149
- "role": "cawdex",
150
- "returncode": run["returncode"],
151
- "stdout": truncate(run["stdout"], limit=16000),
152
- "stderr": truncate(run["stderr"], limit=8000),
153
- }
154
- )
155
-
156
- combined = "\n".join(part for part in [run["stdout"], run["stderr"]] if part)
157
- payload = extract_action_payload(combined)
158
- action = self._action_from_payload(payload) if payload is not None else None
159
- if action is not None:
160
- self._history.append({"role": "selected_action", "content": _single_action_to_data(action)})
161
- return action
162
-
163
- fallback = self._fallback_action(combined or "Cawdex produced no output")
164
- if fallback is not None:
165
- self._history.append({"role": "selected_action", "content": _single_action_to_data(fallback)})
166
- return fallback
167
-
168
- def get_cost(self) -> UpdatableCostReport:
169
- report = UpdatableCostReport.initialize_empty(model_name=self.model)
170
- if self._cost_usd:
171
- report.add_cost(self._cost_usd)
172
- return report
173
-
174
- def close(self) -> None:
175
- return None
176
-
177
- def _build_prompt(self) -> str:
178
- action_docs = [_action_type_to_doc(action) for action in getattr(self, "actions", [])]
179
- context = getattr(self, "context", {}) or {}
180
- task = getattr(self, "task", "")
181
- profile = _profile_for_exgentic(task, context, action_docs)
182
- action_names = [str(doc.get("name", "")) for doc in action_docs if doc.get("name")]
183
- action_shortlist = shortlist_exgentic_actions(
184
- action_docs,
185
- task=task,
186
- context=context,
187
- history=self._history,
188
- profile=profile,
189
- )
190
- lines = [
191
- f"/benchmark {profile} Exgentic task",
192
- "",
193
- "You are running inside Exgentic/Open Agent Leaderboard.",
194
- "Work from the current task, context, latest observation, and the available action schemas.",
195
- "Choose exactly one available action. Do not invent action names.",
196
- "Prefer the recommended action shortlist when it matches the latest observation; use the full schemas only when the current state clearly requires another available action.",
197
- "For shortlisted actions, include every required_argument_key; when available_required_hints lists an exact value from latest observation or context, copy that value into the matching argument.",
198
- "The benchmark may count malformed JSON, unknown action names, or schema-mismatched arguments as invalid actions.",
199
- "End your response with one JSON object on its own line using this exact shape:",
200
- '{"name":"<action name>","arguments":{}}',
201
- "",
202
- "If the benchmark exposes environment actions, return the next action to execute.",
203
- "If the task is complete, use a finish/message action when one is available.",
204
- _profile_guidance(profile),
205
- "",
206
- "## Task",
207
- truncate(task),
208
- "",
209
- "## Context",
210
- json_dumps(context),
211
- "",
212
- "## Recommended action shortlist",
213
- json_dumps(action_shortlist),
214
- "",
215
- "## Available action names",
216
- json_dumps(action_names),
217
- "",
218
- "## Available actions",
219
- json_dumps(action_docs),
220
- ]
221
- if self._history:
222
- lines.extend(["", "## Folded session state", json_dumps(fold_exgentic_history(self._history, profile=profile), limit=24000)])
223
- return "\n".join(lines)
224
-
225
- def _run_cawdex(self, prompt: str) -> dict[str, Any]:
226
- step_dir = self.paths.agent_dir / "cawdex" / f"step-{self._step:03d}"
227
- trace_dir = step_dir / "trace"
228
- step_dir.mkdir(parents=True, exist_ok=True)
229
- trace_dir.mkdir(parents=True, exist_ok=True)
230
- prompt_path = step_dir / "prompt.txt"
231
- prompt_path.write_text(prompt, encoding="utf-8")
232
-
233
- args = _split_command(self.command)
234
- args.extend(["--prompt-file", str(prompt_path), "--perm", self.permission, "--benchmark-trace-dir", str(trace_dir)])
235
- _append_flag(args, "--model", self.model)
236
- _append_flag(args, "--provider", self.provider)
237
- _append_flag(args, "--max-turns", self.max_turns)
238
- _append_flag(args, "--max-tokens", self.max_tokens)
239
- _append_flag(args, "--context-window-tokens", self.context_window_tokens)
240
- _append_flag(args, "--temperature", self.temperature)
241
- _append_flag(args, "--output-format", self.output_format)
242
- args.extend(self.extra_args)
243
-
244
- env = os.environ.copy()
245
- env.update({str(key): str(value) for key, value in self.extra_env.items()})
246
- env.setdefault("CAWDEX_ENV_CONFIG", "1")
247
- env.setdefault("CAWDEX_THEME", "minimal")
248
- env.setdefault("CAWDEX_SHOW_THINKING", "0")
249
- env.setdefault("CAWDEX_BASH_TIMEOUT_MS", "300000")
250
- env["CAWDEX_MEMORY"] = "1" if self.memory else "0"
251
- if self.cawdex_home:
252
- env["CAWDEX_HOME"] = self.cawdex_home
253
-
254
- cwd = self._resolve_workdir()
255
- try:
256
- completed = subprocess.run(
257
- args,
258
- cwd=str(cwd),
259
- env=env,
260
- text=True,
261
- capture_output=True,
262
- timeout=self.timeout_sec,
263
- check=False,
264
- )
265
- stdout = redact(completed.stdout)
266
- stderr = redact(completed.stderr)
267
- returncode = completed.returncode
268
- except subprocess.TimeoutExpired as exc:
269
- stdout = redact(exc.stdout)
270
- stderr = redact(exc.stderr) + f"\ncawdex timed out after {self.timeout_sec}s"
271
- returncode = 124
272
- except Exception as exc:
273
- stdout = ""
274
- stderr = f"cawdex launch failed: {redact(exc)}"
275
- returncode = 127
276
-
277
- (step_dir / "argv.json").write_text(
278
- json.dumps([redact(arg) for arg in args], ensure_ascii=False, indent=2),
279
- encoding="utf-8",
280
- )
281
- (step_dir / "stdout.txt").write_text(stdout, encoding="utf-8")
282
- (step_dir / "stderr.txt").write_text(stderr, encoding="utf-8")
283
- self._load_cost(trace_dir)
284
- return {"returncode": returncode, "stdout": stdout, "stderr": stderr, "trace_dir": str(trace_dir)}
285
-
286
- def _load_cost(self, trace_dir: Path) -> None:
287
- summaries = sorted(trace_dir.rglob("summary.json"), key=lambda item: item.stat().st_mtime)
288
- if not summaries:
289
- return
290
- try:
291
- summary = json.loads(summaries[-1].read_text(encoding="utf-8"))
292
- except Exception:
293
- return
294
- usage = summary.get("usage") if isinstance(summary, dict) else None
295
- if not isinstance(usage, dict):
296
- return
297
- try:
298
- self._cost_usd += float(usage.get("estimatedCostUsd") or 0.0)
299
- except Exception:
300
- return
301
-
302
- def _resolve_workdir(self) -> Path:
303
- if self.workdir:
304
- return Path(self.workdir).expanduser()
305
- context = getattr(self, "context", {}) or {}
306
- for key in ("workdir", "working_dir", "workspace", "repo_path", "cwd"):
307
- value = context.get(key)
308
- if isinstance(value, str) and value.strip():
309
- return Path(value).expanduser()
310
- return Path.cwd()
311
-
312
- def _action_from_payload(self, payload: ActionPayload) -> Action | None:
313
- actions = list(getattr(self, "actions", []) or [])
314
- action_docs = [_action_type_to_doc(action) for action in actions]
315
- repair = repair_exgentic_action_payload(
316
- payload,
317
- action_docs,
318
- argument_hints={
319
- "latest_observation": self._latest_observation_data(),
320
- "context": getattr(self, "context", {}) or {},
321
- },
322
- )
323
- if repair.diagnostics.get("status") != "unchanged":
324
- self._history.append({"role": "action_repair", "content": repair.diagnostics})
325
-
326
- repaired_payload = repair.payload
327
- action_type = _find_action_type(actions, repaired_payload.name)
328
- if action_type is None:
329
- return None
330
- args = _normalize_arguments(action_type, repaired_payload.arguments, fallback_text=json_dumps(repaired_payload.arguments))
331
- try:
332
- return action_type.build_action(args)
333
- except Exception as exc:
334
- self._history.append(
335
- {
336
- "role": "action_repair",
337
- "content": {
338
- "status": "build_failed",
339
- "action": repaired_payload.name,
340
- "error": truncate(exc, limit=1200),
341
- },
342
- }
343
- )
344
- return None
345
-
346
- def _fallback_action(self, text: str) -> Action | None:
347
- actions = list(getattr(self, "actions", []) or [])
348
- if not actions:
349
- return None
350
- action_docs = [_action_type_to_doc(action) for action in actions]
351
- profile = _profile_for_exgentic(getattr(self, "task", ""), getattr(self, "context", {}) or {}, action_docs)
352
- fallback = fallback_exgentic_action_payload(
353
- action_docs,
354
- task=getattr(self, "task", ""),
355
- context=getattr(self, "context", {}) or {},
356
- history=self._history,
357
- profile=profile,
358
- reason="no_valid_action_json",
359
- )
360
- if fallback is not None:
361
- self._history.append({"role": "action_repair", "content": fallback.diagnostics})
362
- action = self._action_from_payload(fallback.payload)
363
- if action is not None:
364
- return action
365
-
366
- preferred = _first_matching_action(actions, lambda action: bool(getattr(action, "is_finish", False)))
367
- if preferred is None:
368
- preferred = _first_matching_action(actions, lambda action: bool(getattr(action, "is_message", False)))
369
- if preferred is None:
370
- preferred = _first_matching_action(actions, lambda action: action.name.lower() in {"finish", "final", "done"})
371
- if preferred is None and len(actions) == 1:
372
- preferred = actions[0]
373
- if preferred is None:
374
- return None
375
- args = _normalize_arguments(preferred, {}, fallback_text=truncate(text, limit=20000))
376
- try:
377
- return preferred.build_action(args)
378
- except Exception:
379
- return None
380
-
381
- def _latest_observation_data(self) -> Any:
382
- for item in reversed(self._history):
383
- if isinstance(item, dict) and item.get("role") == "observation":
384
- return item.get("content")
385
- return None
386
-
387
-
388
- def _split_command(command: str) -> list[str]:
389
- parts = shlex.split(command, posix=os.name != "nt")
390
- return parts or ["cawdex"]
391
-
392
-
393
- def _append_flag(args: list[str], flag: str, value: Any) -> None:
394
- if value is None:
395
- return
396
- text = str(value).strip()
397
- if not text:
398
- return
399
- args.extend([flag, text])
400
-
401
-
402
- def _action_type_to_doc(action: ActionType) -> dict[str, Any]:
403
- args_type = getattr(action, "arguments", None)
404
- schema: Any = None
405
- if args_type is not None:
406
- try:
407
- schema = args_type.model_json_schema()
408
- except Exception:
409
- schema = str(args_type)
410
- return {
411
- "name": action.name,
412
- "description": getattr(action, "description", ""),
413
- "is_finish": bool(getattr(action, "is_finish", False)),
414
- "is_message": bool(getattr(action, "is_message", False)),
415
- "arguments_schema": schema,
416
- }
417
-
418
-
419
- def _profile_for_exgentic(task: Any, context: Any, action_docs: list[dict[str, Any]]) -> str:
420
- text = " ".join(
421
- [
422
- str(task or ""),
423
- json.dumps(context or {}, ensure_ascii=False, default=str),
424
- json.dumps(action_docs or [], ensure_ascii=False, default=str),
425
- ]
426
- ).lower()
427
- if any(token in text for token in ("appworld", "app-world", "app world")):
428
- return "appworld"
429
- if any(token in text for token in ("browsecomp", "browsecomp+", "browse-comp", "deep research", "web research")):
430
- return "browsecomp"
431
- if any(token in text for token in ("tau2", "tau 2", "tau-bench", "tau_bench", "taubench", "customer support", "customer-service")):
432
- return "tau2"
433
- if any(token in text for token in ("terminalworld", "terminal-world", "tw_", "asciinema")) or (
434
- "instruction.md" in text and "solve.sh" in text
435
- ):
436
- return "terminalworld"
437
- if any(token in text for token in ("swe-bench mobile", "xcode", "swift", "objective-c", "simulator", "figma")):
438
- return "swe-bench-mobile"
439
- if any(token in text for token in ("swe-webdevbench", "swe-webdev-bench", "webdevbench", "webdev-bench", "vibe coding", "virtual software agency", "canary requirement", "frontend-backend", "production readiness")):
440
- return "webdevbench"
441
- if any(token in text for token in ("swe-cycle", "swecycle", "swe cycle", "swe-judge", "swejudge", "fullcycle", "codeimpl", "testgen", "run_script", "parsing_script", "selected_test_files_to_run", "environment_setup_commit", "before_repo_set_cmd", "bare repository")):
442
- return "swe-cycle"
443
- if any(token in text for token in ("swe-ci", "sweci", "swe ci", "run_tests", "define_requirements", "modify_code", "test gap", "current_sha", "target_sha", "ci-loop", "continuous integration loop")):
444
- return "swe-ci"
445
- if any(token in text for token in ("swe-prbench", "swe prbench", "swe-pr", "prbench", "pull request review", "code review quality", "human_review_comments", "diff_patch", "type2_contextual")):
446
- return "swe-prbench"
447
- if any(token in text for token in ("tml-bench", "tmlbench", "tabular ml", "kaggle-style", "kaggle style", "sample_submission", "private holdout", "train.csv", "test.csv")):
448
- return "tml-bench"
449
- if any(token in text for token in ("pi-bench", "pibench", "proactive personal assistant", "proactive assistant", "hidden intent", "latent intent", "user profile", "message history", "current app", "proactivity score", "completion score")):
450
- return "pi-bench"
451
- if any(token in text for token in ("saasbench", "saas-bench", "enterprise saas", "tenant", "migration")):
452
- return "saasbench"
453
- if any(token in text for token in ("roadmapbench", "roadmap-bench", "long-horizon", "version upgrade")):
454
- return "roadmapbench"
455
- if any(token in text for token in ("arc-agi", "arc prize", "kaggle arc")):
456
- return "arc-agi"
457
- return "generic"
458
-
459
-
460
- def _profile_guidance(profile: str) -> str:
461
- if profile == "appworld":
462
- return "AppWorld discipline: track app/API state from observations, preserve record IDs and permissions, and finish only after the requested state change is confirmed."
463
- if profile == "browsecomp":
464
- return "BrowseComp+ discipline: decompose the research question, prefer primary/high-authority sources, cross-check facts, and include auditable source attribution in finish/message arguments."
465
- if profile == "tau2":
466
- return "tau2 discipline: read policy/context first, take only policy-supported tool actions, and confirm observations before promising customer outcomes."
467
- if profile == "terminalworld":
468
- return "TerminalWorld discipline: extract required artifacts from instruction.md/task text, avoid solve.sh/reference material, execute real CLI steps, and verify persistent files/services before finishing."
469
- if profile == "swe-bench-mobile":
470
- return "Mobile discipline: respect PRD/design/platform constraints and prefer platform validation evidence when the harness exposes it."
471
- if profile == "webdevbench":
472
- return "WebDevBench discipline: preserve canary business requirements, verify frontend-backend coupling, and seek production/security evidence before completion."
473
- if profile == "swe-cycle":
474
- return "SWE-Cycle discipline: carry lifecycle phase, environment setup state, implementation requirements, generated/selected tests, and static/dynamic judge evidence through each action."
475
- if profile == "swe-ci":
476
- return "SWE-CI discipline: carry current/target commits, test gaps, inferred requirements, code changes, and CI-loop validation deltas through each action."
477
- if profile == "swe-prbench":
478
- return "SWE-PRBench discipline: review diff first, expand only to evidence-needed context, and produce severity-rated findings with file/line evidence rather than patching unless explicitly requested."
479
- if profile == "tml-bench":
480
- return "TML-Bench discipline: establish data contract, avoid hidden-label leakage, validate an honest baseline, and produce a schema-valid submission artifact."
481
- if profile == "pi-bench":
482
- return "Pi-Bench discipline: build the user/workspace/app context contract, infer hidden intent with evidence, ask only necessary clarifying questions, and verify state after proactive actions."
483
- if profile == "saasbench":
484
- return "SaaS discipline: preserve tenant, auth, migration, and cross-component workflow integrity."
485
- if profile == "roadmapbench":
486
- return "Roadmap discipline: keep milestones explicit and avoid claiming completion while roadmap items remain unverified."
487
- if profile == "arc-agi":
488
- return "ARC discipline: infer environment dynamics with small experiments and avoid hardcoding hidden answers."
489
- return "Generic discipline: use the available actions exactly, observe after state-changing actions, and finish only with benchmark-visible evidence."
490
-
491
-
492
- def _find_action_type(actions: list[ActionType], name: str) -> ActionType | None:
493
- for action in actions:
494
- if action.name == name:
495
- return action
496
- lowered = name.lower()
497
- for action in actions:
498
- if action.name.lower() == lowered:
499
- return action
500
- return None
501
-
502
-
503
- def _first_matching_action(actions: list[ActionType], predicate: Any) -> ActionType | None:
504
- for action in actions:
505
- if predicate(action):
506
- return action
507
- return None
508
-
509
-
510
- def _normalize_arguments(action: ActionType, provided: dict[str, Any], fallback_text: str) -> dict[str, Any]:
511
- args = dict(provided or {})
512
- fields = _argument_fields(action)
513
- if not fields:
514
- return args
515
- if any(key in args for key in fields):
516
- return args
517
-
518
- for key in ("answer", "final_answer", "response", "content", "message", "text", "result", "output"):
519
- if key in fields:
520
- args[key] = fallback_text
521
- return args
522
-
523
- for key, field in fields.items():
524
- if _field_required(field):
525
- args[key] = _fallback_value_for_field(field, fallback_text)
526
- return args
527
- return args
528
-
529
-
530
- def _argument_fields(action: ActionType) -> dict[str, Any]:
531
- args_type = getattr(action, "arguments", None)
532
- return dict(getattr(args_type, "model_fields", {}) or getattr(args_type, "__fields__", {}) or {})
533
-
534
-
535
- def _field_required(field: Any) -> bool:
536
- method = getattr(field, "is_required", None)
537
- if callable(method):
538
- return bool(method())
539
- return bool(getattr(field, "required", False))
540
-
541
-
542
- def _fallback_value_for_field(field: Any, text: str) -> Any:
543
- annotation = getattr(field, "annotation", None) or getattr(field, "type_", None)
544
- if annotation is bool:
545
- return False
546
- if annotation is int:
547
- return 0
548
- if annotation is float:
549
- return 0.0
550
- if annotation is list:
551
- return []
552
- if annotation is dict:
553
- return {}
554
- return text
555
-
556
-
557
- def _observation_is_empty(observation: Observation) -> bool:
558
- try:
559
- return bool(observation.is_empty())
560
- except Exception:
561
- return False
562
-
563
-
564
- def _observation_to_data(observation: Observation) -> Any:
565
- try:
566
- items = observation.to_observation_list()
567
- except Exception:
568
- return str(observation)
569
- data: list[Any] = []
570
- for item in items:
571
- result = getattr(item, "result", item)
572
- data.append(result)
573
- return data
574
-
575
-
576
- def _single_action_to_data(action: Action) -> Any:
577
- try:
578
- values = []
579
- for item in action.to_action_list():
580
- args = getattr(item, "arguments", {})
581
- if hasattr(args, "model_dump"):
582
- args = args.model_dump()
583
- values.append({"name": getattr(item, "name", ""), "arguments": args, "id": getattr(item, "id", "")})
584
- return values
585
- except Exception:
586
- return {"id": safe_id(str(action)), "text": str(action)}
587
-
588
-
589
- class CawdexAgentInstance(CawdexAgentInstance):
590
- """Preferred Exgentic runtime class name for Cawdex."""
591
-
592
-
593
- class CawdexAgent(CawdexAgent):
594
- """Preferred Exgentic host class name for Cawdex."""
595
-
596
- display_name: ClassVar[str] = "Cawdex"
597
- slug_name: ClassVar[str] = "cawdex_agent"
598
-
599
- @classmethod
600
- def _get_instance_class(cls):
601
- return CawdexAgentInstance
602
-
603
- @classmethod
604
- def _get_instance_class_ref(cls) -> str:
605
- return f"{cls.__module__}:CawdexAgentInstance"
1
+ """Exgentic/Open Agent Leaderboard adapter for Cawdex."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ import shlex
8
+ import subprocess
9
+ from pathlib import Path
10
+ from typing import Any, ClassVar
11
+
12
+ from pydantic import Field
13
+
14
+ from exgentic.core.agent import Agent
15
+ from exgentic.core.agent_instance import AgentInstance
16
+ from exgentic.core.types import Action, ActionType, Observation
17
+ from exgentic.utils.cost import UpdatableCostReport
18
+
19
+ from .utils import (
20
+ ActionPayload,
21
+ extract_action_payload,
22
+ fallback_exgentic_action_payload,
23
+ fold_exgentic_history,
24
+ json_dumps,
25
+ repair_exgentic_action_payload,
26
+ redact,
27
+ safe_id,
28
+ shortlist_exgentic_actions,
29
+ truncate,
30
+ )
31
+
32
+
33
class CawdexAgent(Agent):
    """Host-side Exgentic config for Cawdex.

    Holds the launcher settings for a Cawdex run and forwards every one of
    them, unchanged, to the per-session runtime through
    ``_get_instance_kwargs``.
    """

    display_name: ClassVar[str] = "Cawdex"
    slug_name: ClassVar[str] = "cawdex_agent"

    model: str = "openrouter/free"
    provider: str | None = None
    # Launcher command line. CAWDEX_EXGENTIC_COMMAND overrides it; an unset
    # or empty variable falls back to the plain "cawdex" executable.
    command: str = Field(default_factory=lambda: os.environ.get("CAWDEX_EXGENTIC_COMMAND") or "cawdex")
    permission: str = "yolo"
    max_steps: int = 50
    max_turns: int | None = None
    max_tokens: int | None = None
    context_window_tokens: int | None = None
    temperature: float | None = None
    output_format: str = "text"
    timeout_sec: int = 1800
    memory: bool = False
    workdir: str | None = None
    cawdex_home: str | None = None
    extra_args: list[str] = Field(default_factory=list)
    extra_env: dict[str, str] = Field(default_factory=dict)

    @classmethod
    def _get_instance_class(cls):
        """Return the runtime class bound to the module-level name at call time."""
        return CawdexAgentInstance

    @classmethod
    def _get_instance_class_ref(cls) -> str:
        """Return the ``module:ClassName`` reference string for the runtime class."""
        return f"{cls.__module__}:CawdexAgentInstance"

    @property
    def model_name(self) -> str:  # type: ignore[override]
        """Return the configured model identifier."""
        return self.model

    def get_models_names(self) -> list[str]:  # type: ignore[override]
        """Return the configured model identifier as a one-element list."""
        return [self.model]

    def _get_instance_kwargs(self, session_id: str) -> dict[str, Any]:
        """Build the keyword arguments passed to the runtime constructor.

        Args:
            session_id: Identifier of the session the runtime will serve.

        Returns:
            A dict holding ``session_id`` plus every configuration field.
        """
        return {
            "session_id": session_id,
            "model": self.model,
            "provider": self.provider,
            "command": self.command,
            "permission": self.permission,
            "max_steps": self.max_steps,
            "max_turns": self.max_turns,
            "max_tokens": self.max_tokens,
            "context_window_tokens": self.context_window_tokens,
            "temperature": self.temperature,
            "output_format": self.output_format,
            "timeout_sec": self.timeout_sec,
            "memory": self.memory,
            "workdir": self.workdir,
            "cawdex_home": self.cawdex_home,
            "extra_args": self.extra_args,
            "extra_env": self.extra_env,
        }
91
+
92
+
93
class CawdexAgentInstance(AgentInstance):
    """Per-session Exgentic runtime that asks Cawdex for the next action.

    Each ``react`` call renders a prompt from the task, context, action
    schemas and folded history, runs the Cawdex command as a subprocess,
    and turns its output into one benchmark action. The prompt, argv,
    stdout and stderr of every step are written under
    ``self.paths.agent_dir / "cawdex" / "step-NNN"``.
    """

    def __init__(
        self,
        session_id: str,
        model: str = "openrouter/free",
        provider: str | None = None,
        command: str = "cawdex",
        permission: str = "yolo",
        max_steps: int = 50,
        max_turns: int | None = None,
        max_tokens: int | None = None,
        context_window_tokens: int | None = None,
        temperature: float | None = None,
        output_format: str = "text",
        timeout_sec: int = 1800,
        memory: bool = False,
        workdir: str | None = None,
        cawdex_home: str | None = None,
        extra_args: list[str] | None = None,
        extra_env: dict[str, str] | None = None,
    ) -> None:
        """Store the launcher settings and reset the per-session counters."""
        super().__init__(session_id=session_id)
        self.model = model
        self.provider = provider
        self.command = command
        self.permission = permission
        self.max_steps = max_steps
        self.max_turns = max_turns
        self.max_tokens = max_tokens
        self.context_window_tokens = context_window_tokens
        self.temperature = temperature
        self.output_format = output_format
        self.timeout_sec = timeout_sec
        self.memory = memory
        self.workdir = workdir
        self.cawdex_home = cawdex_home
        # Copy so later mutation of the caller's containers cannot leak in.
        self.extra_args = list(extra_args or [])
        self.extra_env = dict(extra_env or {})
        self._step = 0
        self._history: list[dict[str, Any]] = []
        self._cost_usd = 0.0

    def react(self, observation: Observation | None) -> Action | None:
        """Run one Cawdex step and return the action it selected.

        Returns ``None`` once ``max_steps`` is reached (a falsy
        ``max_steps`` counts as zero) or when neither the parsed payload
        nor the fallback produces an action.
        """
        if self._step >= int(self.max_steps or 0):
            return None

        if observation is not None and not _observation_is_empty(observation):
            self._history.append({"role": "observation", "content": _observation_to_data(observation)})

        self._step += 1
        prompt = self._build_prompt()
        run = self._run_cawdex(prompt)
        self._history.append(
            {
                "role": "cawdex",
                "returncode": run["returncode"],
                "stdout": truncate(run["stdout"], limit=16000),
                "stderr": truncate(run["stderr"], limit=8000),
            }
        )

        # The action JSON is searched for in stdout and stderr together.
        combined = "\n".join(part for part in [run["stdout"], run["stderr"]] if part)
        payload = extract_action_payload(combined)
        action = self._action_from_payload(payload) if payload is not None else None
        if action is not None:
            self._history.append({"role": "selected_action", "content": _single_action_to_data(action)})
            return action

        fallback = self._fallback_action(combined or "Cawdex produced no output")
        if fallback is not None:
            self._history.append({"role": "selected_action", "content": _single_action_to_data(fallback)})
        return fallback

    def get_cost(self) -> UpdatableCostReport:
        """Return a cost report carrying the USD cost accumulated so far."""
        report = UpdatableCostReport.initialize_empty(model_name=self.model)
        if self._cost_usd:
            report.add_cost(self._cost_usd)
        return report

    def close(self) -> None:
        """Do nothing; this runtime keeps no resources open between steps."""
        return None

    def _build_prompt(self) -> str:
        """Render the prompt text for the current step."""
        action_docs = [_action_type_to_doc(action) for action in getattr(self, "actions", [])]
        context = getattr(self, "context", {}) or {}
        task = getattr(self, "task", "")
        profile = _profile_for_exgentic(task, context, action_docs)
        action_names = [str(doc.get("name", "")) for doc in action_docs if doc.get("name")]
        action_shortlist = shortlist_exgentic_actions(
            action_docs,
            task=task,
            context=context,
            history=self._history,
            profile=profile,
        )
        lines = [
            f"/benchmark {profile} Exgentic task",
            "",
            "You are running inside Exgentic/Open Agent Leaderboard.",
            "Work from the current task, context, latest observation, and the available action schemas.",
            "Choose exactly one available action. Do not invent action names.",
            "Prefer the recommended action shortlist when it matches the latest observation; use the full schemas only when the current state clearly requires another available action.",
            "For shortlisted actions, include every required_argument_key; when available_required_hints lists an exact value from latest observation or context, copy that value into the matching argument.",
            "The benchmark may count malformed JSON, unknown action names, or schema-mismatched arguments as invalid actions.",
            "End your response with one JSON object on its own line using this exact shape:",
            '{"name":"<action name>","arguments":{}}',
            "",
            "If the benchmark exposes environment actions, return the next action to execute.",
            "If the task is complete, use a finish/message action when one is available.",
            _profile_guidance(profile),
            "",
            "## Task",
            truncate(task),
            "",
            "## Context",
            json_dumps(context),
            "",
            "## Recommended action shortlist",
            json_dumps(action_shortlist),
            "",
            "## Available action names",
            json_dumps(action_names),
            "",
            "## Available actions",
            json_dumps(action_docs),
        ]
        if self._history:
            lines.extend(["", "## Folded session state", json_dumps(fold_exgentic_history(self._history, profile=profile), limit=24000)])
        return "\n".join(lines)

    @staticmethod
    def _captured_text(value: Any) -> str:
        """Coerce captured subprocess output to ``str``.

        ``subprocess.TimeoutExpired`` may carry ``None`` or ``bytes`` even
        when the child was started with ``text=True``.
        """
        if value is None:
            return ""
        if isinstance(value, (bytes, bytearray)):
            return bytes(value).decode("utf-8", errors="replace")
        return str(value)

    def _run_cawdex(self, prompt: str) -> dict[str, Any]:
        """Run Cawdex on ``prompt`` and persist the step artifacts.

        Returns:
            A dict with ``returncode``, redacted ``stdout`` and ``stderr``,
            and ``trace_dir``. A timeout is reported as return code 124 and
            any other launch failure as 127.
        """
        step_dir = self.paths.agent_dir / "cawdex" / f"step-{self._step:03d}"
        trace_dir = step_dir / "trace"
        step_dir.mkdir(parents=True, exist_ok=True)
        trace_dir.mkdir(parents=True, exist_ok=True)
        prompt_path = step_dir / "prompt.txt"
        prompt_path.write_text(prompt, encoding="utf-8")

        args = _split_command(self.command)
        args.extend(["--prompt-file", str(prompt_path), "--perm", self.permission, "--benchmark-trace-dir", str(trace_dir)])
        _append_flag(args, "--model", self.model)
        _append_flag(args, "--provider", self.provider)
        _append_flag(args, "--max-turns", self.max_turns)
        _append_flag(args, "--max-tokens", self.max_tokens)
        _append_flag(args, "--context-window-tokens", self.context_window_tokens)
        _append_flag(args, "--temperature", self.temperature)
        _append_flag(args, "--output-format", self.output_format)
        args.extend(self.extra_args)

        env = os.environ.copy()
        env.update({str(key): str(value) for key, value in self.extra_env.items()})
        # setdefault keeps values already supplied by the host or extra_env.
        env.setdefault("CAWDEX_ENV_CONFIG", "1")
        env.setdefault("CAWDEX_THEME", "minimal")
        env.setdefault("CAWDEX_SHOW_THINKING", "0")
        env.setdefault("CAWDEX_BASH_TIMEOUT_MS", "300000")
        # The memory switch always follows this instance's setting.
        env["CAWDEX_MEMORY"] = "1" if self.memory else "0"
        if self.cawdex_home:
            env["CAWDEX_HOME"] = self.cawdex_home

        cwd = self._resolve_workdir()
        try:
            completed = subprocess.run(
                args,
                cwd=str(cwd),
                env=env,
                text=True,
                capture_output=True,
                timeout=self.timeout_sec,
                check=False,
            )
            stdout = redact(completed.stdout)
            stderr = redact(completed.stderr)
            returncode = completed.returncode
        except subprocess.TimeoutExpired as exc:
            # Partial output may be None or bytes; normalise before redacting.
            stdout = redact(self._captured_text(exc.stdout))
            stderr = redact(self._captured_text(exc.stderr)) + f"\ncawdex timed out after {self.timeout_sec}s"
            returncode = 124
        except Exception as exc:
            stdout = ""
            stderr = f"cawdex launch failed: {redact(exc)}"
            returncode = 127

        (step_dir / "argv.json").write_text(
            json.dumps([redact(arg) for arg in args], ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        (step_dir / "stdout.txt").write_text(stdout, encoding="utf-8")
        (step_dir / "stderr.txt").write_text(stderr, encoding="utf-8")
        self._load_cost(trace_dir)
        return {"returncode": returncode, "stdout": stdout, "stderr": stderr, "trace_dir": str(trace_dir)}

    def _load_cost(self, trace_dir: Path) -> None:
        """Add the cost from the newest ``summary.json`` under ``trace_dir``.

        Best effort: a missing, unreadable or malformed summary leaves the
        running total unchanged.
        """

        def modified_at(path: Path) -> float:
            # A file that vanishes mid-scan sorts first instead of raising.
            try:
                return path.stat().st_mtime
            except OSError:
                return 0.0

        try:
            summaries = sorted(trace_dir.rglob("summary.json"), key=modified_at)
        except OSError:
            return
        if not summaries:
            return
        try:
            summary = json.loads(summaries[-1].read_text(encoding="utf-8"))
        except (OSError, ValueError):
            return
        usage = summary.get("usage") if isinstance(summary, dict) else None
        if not isinstance(usage, dict):
            return
        try:
            self._cost_usd += float(usage.get("estimatedCostUsd") or 0.0)
        except (TypeError, ValueError):
            return

    def _resolve_workdir(self) -> Path:
        """Pick the subprocess working directory.

        Order: the explicit ``workdir`` setting, then the first non-blank
        string among several context keys, then the current directory.
        """
        if self.workdir:
            return Path(self.workdir).expanduser()
        context = getattr(self, "context", {}) or {}
        for key in ("workdir", "working_dir", "workspace", "repo_path", "cwd"):
            value = context.get(key)
            if isinstance(value, str) and value.strip():
                return Path(value).expanduser()
        return Path.cwd()

    def _action_from_payload(self, payload: ActionPayload) -> Action | None:
        """Repair ``payload`` against the action schemas and build the action.

        Returns ``None`` when no action type matches the repaired name or
        when building the action raises; both repairs and build failures
        are recorded in the history.
        """
        actions = list(getattr(self, "actions", []) or [])
        action_docs = [_action_type_to_doc(action) for action in actions]
        repair = repair_exgentic_action_payload(
            payload,
            action_docs,
            argument_hints={
                "latest_observation": self._latest_observation_data(),
                "context": getattr(self, "context", {}) or {},
            },
        )
        if repair.diagnostics.get("status") != "unchanged":
            self._history.append({"role": "action_repair", "content": repair.diagnostics})

        repaired_payload = repair.payload
        action_type = _find_action_type(actions, repaired_payload.name)
        if action_type is None:
            return None
        args = _normalize_arguments(action_type, repaired_payload.arguments, fallback_text=json_dumps(repaired_payload.arguments))
        try:
            return action_type.build_action(args)
        except Exception as exc:
            self._history.append(
                {
                    "role": "action_repair",
                    "content": {
                        "status": "build_failed",
                        "action": repaired_payload.name,
                        "error": truncate(exc, limit=1200),
                    },
                }
            )
            return None

    def _fallback_action(self, text: str) -> Action | None:
        """Choose an action when Cawdex produced no usable action JSON.

        Tries the helper-provided fallback payload first, then a finish
        action, a message action, an action named finish/final/done, and
        finally the only available action when exactly one exists.
        """
        actions = list(getattr(self, "actions", []) or [])
        if not actions:
            return None
        action_docs = [_action_type_to_doc(action) for action in actions]
        profile = _profile_for_exgentic(getattr(self, "task", ""), getattr(self, "context", {}) or {}, action_docs)
        fallback = fallback_exgentic_action_payload(
            action_docs,
            task=getattr(self, "task", ""),
            context=getattr(self, "context", {}) or {},
            history=self._history,
            profile=profile,
            reason="no_valid_action_json",
        )
        if fallback is not None:
            self._history.append({"role": "action_repair", "content": fallback.diagnostics})
            action = self._action_from_payload(fallback.payload)
            if action is not None:
                return action

        preferred = _first_matching_action(actions, lambda action: bool(getattr(action, "is_finish", False)))
        if preferred is None:
            preferred = _first_matching_action(actions, lambda action: bool(getattr(action, "is_message", False)))
        if preferred is None:
            preferred = _first_matching_action(actions, lambda action: action.name.lower() in {"finish", "final", "done"})
        if preferred is None and len(actions) == 1:
            preferred = actions[0]
        if preferred is None:
            return None
        args = _normalize_arguments(preferred, {}, fallback_text=truncate(text, limit=20000))
        try:
            return preferred.build_action(args)
        except Exception:
            return None

    def _latest_observation_data(self) -> Any:
        """Return the content of the most recent observation entry, if any."""
        for item in reversed(self._history):
            if isinstance(item, dict) and item.get("role") == "observation":
                return item.get("content")
        return None
386
+
387
+
388
def _split_command(command: str) -> list[str]:
    """Tokenise the launcher command string into an argv list.

    POSIX quoting rules apply unless ``os.name`` is ``"nt"``. A command
    that yields no tokens is replaced by ``["cawdex"]``.
    """
    use_posix_rules = os.name != "nt"
    tokens = shlex.split(command, posix=use_posix_rules)
    if tokens:
        return tokens
    return ["cawdex"]
391
+
392
+
393
def _append_flag(args: list[str], flag: str, value: Any) -> None:
    """Append ``flag`` and its value to ``args`` when the value is usable.

    ``None`` and values whose stripped string form is empty are skipped;
    anything else is appended as ``[flag, str(value).strip()]``.
    """
    rendered = "" if value is None else str(value).strip()
    if rendered:
        args.append(flag)
        args.append(rendered)
400
+
401
+
402
def _action_type_to_doc(action: ActionType) -> dict[str, Any]:
    """Describe an action type as a plain dict for the prompt.

    The argument schema comes from ``arguments.model_json_schema()``; if
    that call fails, the string form of the arguments type is used, and
    it stays ``None`` when the action declares no arguments.
    """
    arg_model = getattr(action, "arguments", None)
    if arg_model is None:
        schema: Any = None
    else:
        try:
            schema = arg_model.model_json_schema()
        except Exception:
            schema = str(arg_model)

    doc: dict[str, Any] = {"name": action.name}
    doc["description"] = getattr(action, "description", "")
    for marker in ("is_finish", "is_message"):
        doc[marker] = bool(getattr(action, marker, False))
    doc["arguments_schema"] = schema
    return doc
417
+
418
+
419
def _profile_for_exgentic(task: Any, context: Any, action_docs: list[dict[str, Any]]) -> str:
    """Guess the benchmark profile from the task, context and action docs.

    The three inputs are flattened into one lower-cased string and checked
    against an ordered rule table; the first profile with a matching
    substring wins. ``"generic"`` is returned when nothing matches.
    """
    haystack = " ".join(
        (
            str(task or ""),
            json.dumps(context or {}, ensure_ascii=False, default=str),
            json.dumps(action_docs or [], ensure_ascii=False, default=str),
        )
    ).lower()

    # Order matters: earlier rows take precedence over later ones.
    rules = (
        ("appworld", ("appworld", "app-world", "app world")),
        ("browsecomp", ("browsecomp", "browsecomp+", "browse-comp", "deep research", "web research")),
        ("tau2", ("tau2", "tau 2", "tau-bench", "tau_bench", "taubench", "customer support", "customer-service")),
        ("terminalworld", ("terminalworld", "terminal-world", "tw_", "asciinema")),
        ("swe-bench-mobile", ("swe-bench mobile", "xcode", "swift", "objective-c", "simulator", "figma")),
        ("webdevbench", ("swe-webdevbench", "swe-webdev-bench", "webdevbench", "webdev-bench", "vibe coding", "virtual software agency", "canary requirement", "frontend-backend", "production readiness")),
        ("swe-cycle", ("swe-cycle", "swecycle", "swe cycle", "swe-judge", "swejudge", "fullcycle", "codeimpl", "testgen", "run_script", "parsing_script", "selected_test_files_to_run", "environment_setup_commit", "before_repo_set_cmd", "bare repository")),
        ("swe-ci", ("swe-ci", "sweci", "swe ci", "run_tests", "define_requirements", "modify_code", "test gap", "current_sha", "target_sha", "ci-loop", "continuous integration loop")),
        ("swe-prbench", ("swe-prbench", "swe prbench", "swe-pr", "prbench", "pull request review", "code review quality", "human_review_comments", "diff_patch", "type2_contextual")),
        ("tml-bench", ("tml-bench", "tmlbench", "tabular ml", "kaggle-style", "kaggle style", "sample_submission", "private holdout", "train.csv", "test.csv")),
        ("pi-bench", ("pi-bench", "pibench", "proactive personal assistant", "proactive assistant", "hidden intent", "latent intent", "user profile", "message history", "current app", "proactivity score", "completion score")),
        ("saasbench", ("saasbench", "saas-bench", "enterprise saas", "tenant", "migration")),
        ("roadmapbench", ("roadmapbench", "roadmap-bench", "long-horizon", "version upgrade")),
        ("arc-agi", ("arc-agi", "arc prize", "kaggle arc")),
    )
    for profile, needles in rules:
        if any(needle in haystack for needle in needles):
            return profile
        # TerminalWorld is also recognised when both task files are mentioned.
        if profile == "terminalworld" and "instruction.md" in haystack and "solve.sh" in haystack:
            return profile
    return "generic"
458
+
459
+
460
def _profile_guidance(profile: str) -> str:
    """Return the one-line discipline reminder for a benchmark profile.

    Profiles without a dedicated entry get the generic reminder.
    """
    guidance = {
        "appworld": "AppWorld discipline: track app/API state from observations, preserve record IDs and permissions, and finish only after the requested state change is confirmed.",
        "browsecomp": "BrowseComp+ discipline: decompose the research question, prefer primary/high-authority sources, cross-check facts, and include auditable source attribution in finish/message arguments.",
        "tau2": "tau2 discipline: read policy/context first, take only policy-supported tool actions, and confirm observations before promising customer outcomes.",
        "terminalworld": "TerminalWorld discipline: extract required artifacts from instruction.md/task text, avoid solve.sh/reference material, execute real CLI steps, and verify persistent files/services before finishing.",
        "swe-bench-mobile": "Mobile discipline: respect PRD/design/platform constraints and prefer platform validation evidence when the harness exposes it.",
        "webdevbench": "WebDevBench discipline: preserve canary business requirements, verify frontend-backend coupling, and seek production/security evidence before completion.",
        "swe-cycle": "SWE-Cycle discipline: carry lifecycle phase, environment setup state, implementation requirements, generated/selected tests, and static/dynamic judge evidence through each action.",
        "swe-ci": "SWE-CI discipline: carry current/target commits, test gaps, inferred requirements, code changes, and CI-loop validation deltas through each action.",
        "swe-prbench": "SWE-PRBench discipline: review diff first, expand only to evidence-needed context, and produce severity-rated findings with file/line evidence rather than patching unless explicitly requested.",
        "tml-bench": "TML-Bench discipline: establish data contract, avoid hidden-label leakage, validate an honest baseline, and produce a schema-valid submission artifact.",
        "pi-bench": "Pi-Bench discipline: build the user/workspace/app context contract, infer hidden intent with evidence, ask only necessary clarifying questions, and verify state after proactive actions.",
        "saasbench": "SaaS discipline: preserve tenant, auth, migration, and cross-component workflow integrity.",
        "roadmapbench": "Roadmap discipline: keep milestones explicit and avoid claiming completion while roadmap items remain unverified.",
        "arc-agi": "ARC discipline: infer environment dynamics with small experiments and avoid hardcoding hidden answers.",
    }
    generic = "Generic discipline: use the available actions exactly, observe after state-changing actions, and finish only with benchmark-visible evidence."
    for known, reminder in guidance.items():
        if profile == known:
            return reminder
    return generic
490
+
491
+
492
def _find_action_type(actions: list[ActionType], name: str) -> ActionType | None:
    """Look up an action type by name.

    An exact match anywhere in the list beats a case-insensitive one;
    ``None`` is returned when neither pass finds a match.
    """
    exact = next((candidate for candidate in actions if candidate.name == name), None)
    if exact is not None:
        return exact
    folded = name.lower()
    return next((candidate for candidate in actions if candidate.name.lower() == folded), None)
501
+
502
+
503
def _first_matching_action(actions: list[ActionType], predicate: Any) -> ActionType | None:
    """Return the first action for which ``predicate`` is truthy, else ``None``."""
    return next((candidate for candidate in actions if predicate(candidate)), None)
508
+
509
+
510
def _normalize_arguments(action: ActionType, provided: dict[str, Any], fallback_text: str) -> dict[str, Any]:
    """Return a copy of ``provided`` with at most one argument filled in.

    Nothing is added when the action declares no fields or when any
    declared field is already supplied. Otherwise the first declared
    text-like field receives ``fallback_text``; failing that, the first
    required field receives a type-appropriate placeholder.
    """
    merged = dict(provided or {})
    declared = _argument_fields(action)
    if not declared or any(field_name in merged for field_name in declared):
        return merged

    text_slots = ("answer", "final_answer", "response", "content", "message", "text", "result", "output")
    text_slot = next((slot for slot in text_slots if slot in declared), None)
    if text_slot is not None:
        merged[text_slot] = fallback_text
        return merged

    required_name = next(
        (field_name for field_name, spec in declared.items() if _field_required(spec)),
        None,
    )
    if required_name is not None:
        merged[required_name] = _fallback_value_for_field(declared[required_name], fallback_text)
    return merged
528
+
529
+
530
def _argument_fields(action: ActionType) -> dict[str, Any]:
    """Return a copy of the field mapping of the action's arguments type.

    ``model_fields`` is preferred and ``__fields__`` is the fallback; an
    empty dict is returned when neither attribute holds any fields.
    """
    arg_model = getattr(action, "arguments", None)
    for attribute in ("model_fields", "__fields__"):
        found = getattr(arg_model, attribute, {})
        if found:
            return dict(found)
    return {}
533
+
534
+
535
def _field_required(field: Any) -> bool:
    """Report whether a field descriptor is marked as required.

    A callable ``is_required`` attribute is asked first; otherwise the
    truthiness of a ``required`` attribute decides, defaulting to False.
    """
    probe = getattr(field, "is_required", None)
    return bool(probe()) if callable(probe) else bool(getattr(field, "required", False))
540
+
541
+
542
def _fallback_value_for_field(field: Any, text: str) -> Any:
    """Return a placeholder value matching the field's declared type.

    The type is read from ``annotation`` (or ``type_`` when that is
    falsy). Exactly ``bool``, ``int``, ``float``, ``list`` and ``dict``
    yield their empty value; every other annotation yields ``text``.
    """
    declared = getattr(field, "annotation", None) or getattr(field, "type_", None)
    # Identity comparison on purpose: subclasses and generics fall through.
    for plain_type in (bool, int, float, list, dict):
        if declared is plain_type:
            return plain_type()
    return text
555
+
556
+
557
def _observation_is_empty(observation: Observation) -> bool:
    """Ask the observation whether it is empty.

    Any error raised while asking is treated as "not empty".
    """
    try:
        empty = bool(observation.is_empty())
    except Exception:
        empty = False
    return empty
562
+
563
+
564
def _observation_to_data(observation: Observation) -> Any:
    """Convert an observation into plain data for the history.

    Each item of ``to_observation_list()`` contributes its ``result``
    attribute, or the item itself when it has none. If the list cannot
    be obtained, the string form of the observation is returned instead.
    """
    try:
        entries = observation.to_observation_list()
    except Exception:
        return str(observation)
    return [getattr(entry, "result", entry) for entry in entries]
574
+
575
+
576
def _single_action_to_data(action: Action) -> Any:
    """Convert an action into plain data for the history.

    Normally returns one ``{"name", "arguments", "id"}`` dict per item of
    ``to_action_list()``, dumping arguments that expose ``model_dump``.
    If anything in that conversion raises, a single dict holding a safe id
    and the string form of the action is returned instead.
    """

    def describe(step: Any) -> dict[str, Any]:
        payload = getattr(step, "arguments", {})
        if hasattr(payload, "model_dump"):
            payload = payload.model_dump()
        return {"name": getattr(step, "name", ""), "arguments": payload, "id": getattr(step, "id", "")}

    try:
        return [describe(step) for step in action.to_action_list()]
    except Exception:
        return {"id": safe_id(str(action)), "text": str(action)}
587
+
588
+
589
class CawdexAgentInstance(CawdexAgentInstance):
    """Preferred Exgentic runtime class name for Cawdex."""

    # The base expression is evaluated before the name is rebound, so this
    # subclasses the CawdexAgentInstance defined earlier in the module and
    # then takes over the module-level name. It adds no behaviour.
591
+
592
+
593
class CawdexAgent(CawdexAgent):
    """Preferred Exgentic host class name for Cawdex."""

    # The base expression is evaluated before the name is rebound, so this
    # subclasses the CawdexAgent defined earlier in the module and then
    # takes over the module-level name.

    display_name: ClassVar[str] = "Cawdex"
    slug_name: ClassVar[str] = "cawdex_agent"

    @classmethod
    def _get_instance_class_ref(cls) -> str:
        """Return the ``module:ClassName`` reference string for the runtime class."""
        module_name = cls.__module__
        return f"{module_name}:CawdexAgentInstance"

    @classmethod
    def _get_instance_class(cls):
        """Return the runtime class bound to the module-level name at call time."""
        runtime_class = CawdexAgentInstance
        return runtime_class