cawdex 1.35.75 → 1.35.77
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/bin/anycode.js +2 -2
- package/bin/cawdex.js +408 -408
- package/bin/ecc-hooks.cjs +11 -11
- package/dist/agents-md.d.ts +31 -0
- package/dist/agents-md.js +340 -0
- package/dist/agents-md.js.map +1 -0
- package/dist/agents.js +1424 -1424
- package/dist/api.d.ts +1 -0
- package/dist/api.js +19 -14
- package/dist/api.js.map +1 -1
- package/dist/autonomous-loops.js +287 -287
- package/dist/benchmark-repos.d.ts +31 -0
- package/dist/benchmark-repos.js +234 -8
- package/dist/benchmark-repos.js.map +1 -1
- package/dist/command-palette.js +5 -2
- package/dist/command-palette.js.map +1 -1
- package/dist/compaction.js +8 -8
- package/dist/config.js +57 -36
- package/dist/config.js.map +1 -1
- package/dist/content-engine.js +543 -543
- package/dist/context-brief.d.ts +4 -0
- package/dist/context-brief.js +230 -0
- package/dist/context-brief.js.map +1 -0
- package/dist/cost-tracker.d.ts +33 -14
- package/dist/cost-tracker.js +81 -19
- package/dist/cost-tracker.js.map +1 -1
- package/dist/coverage.js +39 -39
- package/dist/docs-sync.js +98 -98
- package/dist/evaluation.js +452 -452
- package/dist/fixed-footer.d.ts +11 -2
- package/dist/fixed-footer.js +115 -26
- package/dist/fixed-footer.js.map +1 -1
- package/dist/git-workflow.js +49 -49
- package/dist/imports.d.ts +126 -0
- package/dist/imports.js +611 -0
- package/dist/imports.js.map +1 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +367 -66
- package/dist/index.js.map +1 -1
- package/dist/inline-suggest.js +136 -12
- package/dist/inline-suggest.js.map +1 -1
- package/dist/live-queue.js +1 -1
- package/dist/live-queue.js.map +1 -1
- package/dist/model-aliases.d.ts +37 -0
- package/dist/model-aliases.js +203 -0
- package/dist/model-aliases.js.map +1 -0
- package/dist/orchestration.js +15 -15
- package/dist/permissions.d.ts +6 -0
- package/dist/permissions.js +53 -0
- package/dist/permissions.js.map +1 -1
- package/dist/pm2-manager.js +26 -26
- package/dist/query.d.ts +0 -1
- package/dist/query.js +105 -41
- package/dist/query.js.map +1 -1
- package/dist/refactor.js +87 -87
- package/dist/repo-command.js +7 -1
- package/dist/repo-command.js.map +1 -1
- package/dist/search-first.js +92 -92
- package/dist/skill-create.js +100 -100
- package/dist/stitch.js +1 -1
- package/dist/system-prompt.d.ts +2 -1
- package/dist/system-prompt.js +10 -5
- package/dist/system-prompt.js.map +1 -1
- package/dist/tools/github-repo-digest.d.ts +1 -1
- package/dist/tools/github-repo-digest.js +38 -6
- package/dist/tools/github-repo-digest.js.map +1 -1
- package/dist/types.d.ts +9 -0
- package/dist/types.js.map +1 -1
- package/dist/verification.js +55 -55
- package/package.json +1 -1
- package/resources/__init__.py +1 -1
- package/resources/exgentic/cawdex_agent/README.md +114 -114
- package/resources/exgentic/cawdex_agent/__init__.py +5 -5
- package/resources/exgentic/cawdex_agent/agent.py +605 -605
- package/resources/exgentic/cawdex_agent/requirements.txt +2 -2
- package/resources/exgentic/cawdex_agent/setup.sh +21 -21
- package/resources/exgentic/cawdex_agent/utils.py +1061 -1061
- package/resources/hal/cawdex_agent/README.md +24 -24
- package/resources/hal/cawdex_agent/__init__.py +1 -1
- package/resources/hal/cawdex_agent/main.py +550 -550
- package/resources/hal/cawdex_agent/requirements.txt +2 -2
- package/resources/kbench/cawdex_agent/README.md +107 -107
- package/resources/kbench/cawdex_agent/adapter.manifest.json +19 -19
- package/resources/kbench/cawdex_agent/runner.mjs +753 -753
- package/resources/open_agent_leaderboard/cawdex-agent-card.md +119 -119
- package/resources/terminal_bench/__init__.py +1 -1
- package/resources/terminal_bench/cawdex_agent.py +174 -174
- package/resources/terminal_bench/setup.sh +121 -121
|
@@ -1,119 +1,119 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: Cawdex
|
|
3
|
-
version: 1.35.58
|
|
4
|
-
developers:
|
|
5
|
-
- Crownelius
|
|
6
|
-
license: MIT
|
|
7
|
-
repository: https://github.com/Crownelius/cawdex
|
|
8
|
-
framework: CLI agent with packaged benchmark adapters
|
|
9
|
-
models:
|
|
10
|
-
- configurable OpenAI-compatible model
|
|
11
|
-
tags:
|
|
12
|
-
- coding-agent
|
|
13
|
-
- general-agent
|
|
14
|
-
- open-agent-leaderboard
|
|
15
|
-
- exgentic
|
|
16
|
-
- terminal-bench
|
|
17
|
-
- kbench
|
|
18
|
-
- hal
|
|
19
|
-
- mempalace
|
|
20
|
-
---
|
|
21
|
-
|
|
22
|
-
# Cawdex Agent Card
|
|
23
|
-
|
|
24
|
-
## Agent Details
|
|
25
|
-
|
|
26
|
-
Cawdex is a terminal coding agent system with a mind for the whole repo. It runs as a CLI, speaks OpenAI-compatible APIs, and packages adapters for Terminal-Bench, KBench, HAL, and Exgentic/Open Agent Leaderboard style evaluation. The npm package is `cawdex` and installs both the primary `cawdex` command and the legacy `cawdex` alias.
|
|
27
|
-
|
|
28
|
-
This card documents the agent system. It is not an official benchmark result and does not claim leaderboard performance without official harness output.
|
|
29
|
-
|
|
30
|
-
## Architecture
|
|
31
|
-
|
|
32
|
-
Cawdex uses an iterative tool-calling loop with benchmark mode prompts, read-only preflight context, task-contract extraction, todo tracking, source-specific research, redacted benchmark traces, and structured process scoring.
|
|
33
|
-
|
|
34
|
-
Primary tool surfaces include shell execution, file read/write/edit/patch operations, search/glob/listing, `benchmark_context`, `harness_components`, `research_sources`, `benchmark_repo_catalog`, `github_repo_digest`, `todo_write`, web fetch/search, MemPalace memory tools, and progressive-disclosure skills.
|
|
35
|
-
|
|
36
|
-
Source-grounded benchmark work uses `research_sources` for current arXiv/GitHub/Hugging Face/Kaggle discovery and `benchmark_repo_catalog` as an offline Terminal-Bench public-repo seed, then `github_repo_digest` on relevant public repositories before adopting repo-level implementation patterns. The digest evidence is treated as a bounded demonstration to compare against local manifests, commands, CI files, and component surfaces, not as authority over the current checkout.
|
|
37
|
-
|
|
38
|
-
Benchmark mode emphasizes:
|
|
39
|
-
|
|
40
|
-
- Current task instructions and verifier output over prior memory.
|
|
41
|
-
- Task-contract checklist creation before edits.
|
|
42
|
-
- Task-alignment checks for ignored constraints, distractor/decoy references, and off-task-looking edits.
|
|
43
|
-
- Reproduction before repair when feasible.
|
|
44
|
-
- Live preflight checks for PATH, package-manager, interpreter, virtualenv, and network/offline mismatches before treating a verifier as representative.
|
|
45
|
-
- Narrow-to-broad validation after edits.
|
|
46
|
-
- Reward-hack checks for verifier tampering, oracle/solution probes, result-file edits, shortcut completion markers, and bypass commands.
|
|
47
|
-
- CI workflow reconstruction from visible configuration.
|
|
48
|
-
- Anti-leakage handling for oracle, answer, gold, hidden, result, and solution files.
|
|
49
|
-
- Bounded replay of prior read/search/verifier checkpoints as hypotheses, not patch recipes.
|
|
50
|
-
|
|
51
|
-
## Memory
|
|
52
|
-
|
|
53
|
-
Cawdex includes MemPalace-backed project and global memory. In benchmark mode, `benchmark_context` can surface bounded relevant memories and prior local benchmark experience cards. These are explicitly framed as hypotheses that must be verified against current task files and verifier output. For Pi-Bench-style tasks, prior-run ranking uses complete proactivity ledgers as positive evidence and routes proactivity-ledger defects to warnings instead of replay hints.
|
|
54
|
-
|
|
55
|
-
Benchmark traces write compact `experienceCard` summaries with replay checkpoints, failure signatures, component-observability edit classification, task-contract state and signals, task-alignment, spec-compliance, reward-hack, HarnessAudit-style harness-safety, long-horizon roadmap/SaaS/mobile/WebDevBench/SWE-Cycle/SWE-CI coverage risk signals, and Pi-Bench proactivity ledger signals for context contract, hidden-intent hypothesis, clarification, privacy, and observable completion evidence. They also capture environment-reconstruction setup/failure evidence, dependency-upgrade setup-validation evidence, root-cause hypothesis state, failure-onset diagnosis state, trajectory-triage categories for interaction/execution/environment informativeness, and targeted-fix counts for failed-verifier repair edits, decision-observability edit predictions, validation-reliability evidence, context-utilization precision/miss evidence plus pre-edit context-bloat evidence, trajectory-cleanup evidence for base64/data-URI blobs, high-entropy output, duplicate output, and excessive truncation, evidence-grounding signals for stale/no-effect edit retries without a current-state refresh, AHE publish-state mutation signals for post-pass edits or state-changing commands without revalidation, run-efficiency action/usage/cost/time evidence, source-research coverage, verification commands, changed files, and warnings. They also emit a redacted ACC-style task/context/answer JSONL artifact and an AHE-style change-evaluation artifact for edit prediction verdicts, unpredicted edits, targeted-fix counts, and post-edit regression-cycle attribution.
|
|
56
|
-
|
|
57
|
-
## Models
|
|
58
|
-
|
|
59
|
-
Cawdex classifies edit targets by changed surface and records explicit regression foresight for AHE-style component and decision observability. Non-trivial benchmark edits should name the component surface and include both `Prediction:` and `At-risk regression:` lines; failed-verifier repair edits should also record a `Root cause:`, `Diagnosis:`, or failure-tied `Hypothesis:` plus a `Targeted fix:` or `Fix plan:` before patching. Repeated failing-verifier reruns and post-edit regression cycles are compressed into a failure-onset diagnosis so repair loops can be inspected before another patch. Missing forecasts, diagnoses, failure-onset diagnoses, and fix plans appear in change evaluation, experience-card state, trajectory quality, process defects, and completion reminders.
|
|
60
|
-
|
|
61
|
-
Cawdex is model-agnostic across OpenAI-compatible providers. Common configurations include OpenRouter, OpenAI, NVIDIA, Ollama, LM Studio, and DeepSeek-compatible endpoints. The model and provider are selected by CLI flags or environment configuration.
|
|
62
|
-
|
|
63
|
-
## Supported Environments
|
|
64
|
-
|
|
65
|
-
Packaged evaluation surfaces:
|
|
66
|
-
|
|
67
|
-
- Terminal-Bench adapter.
|
|
68
|
-
- KBench custom adapter.
|
|
69
|
-
- HAL custom agent.
|
|
70
|
-
- Exgentic/Open Agent Leaderboard custom agent.
|
|
71
|
-
|
|
72
|
-
Benchmark mode is designed for SWE-bench-style code repair, Terminal-Bench and TerminalWorld-style terminal tasks, context-reuse benchmarks, long-horizon RoadmapBench/SaaSBench/SWE-Bench Mobile/SWE-WebDevBench/SWE-Cycle/SWE-CI-style tasks, SWE-PRBench-style pull request review tasks, TML-Bench/Kaggle-style tabular ML tasks, Pi-Bench-style proactive personal assistant tasks, Open Agent AppWorld/BrowseComp+/tau2-style tasks, and generic multi-step tool-use tasks. The TerminalWorld profile treats `instruction.md` or task text as the outcome contract, marks `solve.sh` as oracle-only reference material, and emphasizes real CLI execution plus persistent `/app` artifact/service verification for tasks synthesized from in-the-wild terminal recordings. The Exgentic adapter builds a deterministic recommended action shortlist from the current task, context, latest observation, profile, schemas, and recent diagnostics before showing the full action schemas, highlights required argument keys with redacted exact current-state hints when available, repairs case/camelCase/schema-key near misses and exact latest-observation/context required-argument omissions before `ActionType` dispatch, avoids repeating no-effect actions when the latest observation did not change, uses the shortlist/hints to recover from malformed or missing action JSON with a viable non-finish action while completion is not ready, then folds prior observations/actions into a compact task-relevant ledger between steps so long noisy sessions keep current state, policy evidence, TerminalWorld artifact requirements and reference-solution avoidance, WebDevBench canary requirements, frontend/backend evidence, SWE-Cycle lifecycle phases, environment setup state, generated/selected tests, judge evidence, SWE-CI current/target commits, test gaps, inferred requirements, verifier deltas, SWE-PRBench PR metadata/diff/findings/evidence gaps, TML-Bench data contract/validation/submission evidence, Pi-Bench user/workspace/app context, hidden-intent hypotheses, clarification state, privacy risk, selected actions, and diagnostics in view without repeatedly reinjecting raw transcripts. The interactive CLI acknowledges submitted turns before provider I/O, treats F5/Shift+F5 raw terminal sequences as active-turn cancellation on Windows/xterm terminals, and auto-heals known-stuck OpenRouter preview models to the configured free fallback unless explicitly overridden.
|
|
73
|
-
|
|
74
|
-
## Evaluation Results
|
|
75
|
-
|
|
76
|
-
Cawdex writes `summary.json`, `trace.jsonl`, `worktree.patch`, `git-status.txt`, `open-agent-leaderboard-draft.json`, `agent-context-compiled.jsonl`, `change-evaluation.json`, and `submission-bundle-manifest.json` artifacts when benchmark trace output is enabled. The draft row follows the public Open Agent Leaderboard result-column shape where local trace evidence can support it, but remains `submissionReady:false` until an official harness supplies benchmark-owned scores and session success evidence. The submission bundle manifest indexes artifact paths and SHA-256 hashes, summarizes verifier/usage/process evidence, and lists missing official fields so local traces are not mistaken for leaderboard scores. Trajectory quality includes explicit task-alignment, spec-compliance, reward-hack, HarnessAudit-style harness-safety, long-horizon coverage, Pi-Bench proactivity, component-observability, root-cause hypothesis, failure-onset diagnosis, trajectory triage, targeted-fix manifest, context-utilization, candidate-file dossier, trajectory cleanup, pre-edit context-bloat, weak change-manifest, and evidence-grounding risk fields so benchmark reviewers can separate genuine task progress from distractor-following, repair edits without a failure-tied diagnosis or fix plan, undiagnosed verifier repair loops, recall-heavy unused context gathering without a compact localization dossier, noisy encoded or duplicate tool observations, visible-suite-only validation, stale/no-effect edit loops, verifier tampering, unsafe protected-resource access, external information transfer, destructive operations, oracle access, shortcut score markers, unsupported RoadmapBench/SaaSBench/SWE-Bench Mobile/SWE-WebDevBench/SWE-Cycle/SWE-CI/Pi-Bench completion claims, or edits/actions without falsifiable prediction and observable follow-through.
|
|
77
|
-
|
|
78
|
-
Prior experience reuse also reads AHE change-evaluation verdicts. Confirmed manifests can rank higher, while contradicted, regression-risk, pending-verification, missing-prediction, or missing-regression-forecast manifests are emitted as warning patterns instead of replay hints.
|
|
79
|
-
|
|
80
|
-
Prior experience reuse also applies ContextBench-style context discipline: concise runs whose inspected files were used by the eventual patch can rank higher, while low-utilization or pre-edit context-bloat runs are emitted as warning patterns instead of replay hints.
|
|
81
|
-
|
|
82
|
-
Prior experience reuse also applies AHE-style cleanup discipline: traces with encoded blobs, duplicate observations, or excessive truncation are emitted as warning patterns instead of replay hints.
|
|
83
|
-
|
|
84
|
-
Prior experience reuse also applies AHE-style diagnosis discipline: traces that repaired after a failed verifier without a root-cause hypothesis are emitted as warning patterns instead of replay hints.
|
|
85
|
-
|
|
86
|
-
Prior experience reuse also applies failure-onset discipline: traces with repeated verifier loops or regression cycles that were repaired without a diagnosis are emitted as warning patterns instead of replay hints.
|
|
87
|
-
|
|
88
|
-
Prior experience reuse also applies AHE-style fix-plan discipline: traces that repaired after a failed verifier without a targeted-fix manifest are emitted as warning patterns instead of replay hints.
|
|
89
|
-
|
|
90
|
-
Interactive type-ahead is preserved through active-turn cancellation and permission interruptions: text captured while the model/tool chain is running is restored into the next editable prompt instead of being emitted as a passive queued hint or silently submitted.
|
|
91
|
-
|
|
92
|
-
Official results should be produced through Exgentic, HAL, Terminal-Bench, KBench, or another benchmark-owned grader before submission.
|
|
93
|
-
|
|
94
|
-
## Limitations
|
|
95
|
-
|
|
96
|
-
- Prior memory and replay traces can be stale or mismatched; current task evidence must override them.
|
|
97
|
-
- Generic web or source research cannot prove task success without local or official verifier evidence.
|
|
98
|
-
- The agent card does not substitute for official harness scoring.
|
|
99
|
-
- Hosted or sandboxed benchmarks may restrict network, package install, or provider access; use pinned bundles or preinstalled `cawdex` where possible. The legacy `cawdex` alias remains supported for older images.
|
|
100
|
-
- Open-weight or free-tier models may show high variance on long-horizon tasks.
|
|
101
|
-
|
|
102
|
-
## How To Run
|
|
103
|
-
|
|
104
|
-
Print packaged adapters and card paths:
|
|
105
|
-
|
|
106
|
-
```bash
|
|
107
|
-
cawdex --print-exgentic-agent
|
|
108
|
-
cawdex --print-hal-agent
|
|
109
|
-
cawdex --print-kbench-adapter
|
|
110
|
-
cawdex --print-open-agent-card
|
|
111
|
-
```
|
|
112
|
-
|
|
113
|
-
Run a benchmark task directly:
|
|
114
|
-
|
|
115
|
-
```bash
|
|
116
|
-
cawdex --provider openrouter --model openrouter/free --perm yolo --prompt "/benchmark swe-bench fix the issue"
|
|
117
|
-
```
|
|
118
|
-
|
|
119
|
-
For reproducible leaderboard runs, pin the installed package or bundle, pass provider credentials through environment variables, enable benchmark trace output, and submit only official harness results.
|
|
1
|
+
---
|
|
2
|
+
name: Cawdex
|
|
3
|
+
version: 1.35.58
|
|
4
|
+
developers:
|
|
5
|
+
- Crownelius
|
|
6
|
+
license: MIT
|
|
7
|
+
repository: https://github.com/Crownelius/cawdex
|
|
8
|
+
framework: CLI agent with packaged benchmark adapters
|
|
9
|
+
models:
|
|
10
|
+
- configurable OpenAI-compatible model
|
|
11
|
+
tags:
|
|
12
|
+
- coding-agent
|
|
13
|
+
- general-agent
|
|
14
|
+
- open-agent-leaderboard
|
|
15
|
+
- exgentic
|
|
16
|
+
- terminal-bench
|
|
17
|
+
- kbench
|
|
18
|
+
- hal
|
|
19
|
+
- mempalace
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
# Cawdex Agent Card
|
|
23
|
+
|
|
24
|
+
## Agent Details
|
|
25
|
+
|
|
26
|
+
Cawdex is a terminal coding agent system with a mind for the whole repo. It runs as a CLI, speaks OpenAI-compatible APIs, and packages adapters for Terminal-Bench, KBench, HAL, and Exgentic/Open Agent Leaderboard style evaluation. The npm package is `cawdex` and installs both the primary `cawdex` command and the legacy `cawdex` alias.
|
|
27
|
+
|
|
28
|
+
This card documents the agent system. It is not an official benchmark result and does not claim leaderboard performance without official harness output.
|
|
29
|
+
|
|
30
|
+
## Architecture
|
|
31
|
+
|
|
32
|
+
Cawdex uses an iterative tool-calling loop with benchmark mode prompts, read-only preflight context, task-contract extraction, todo tracking, source-specific research, redacted benchmark traces, and structured process scoring.
|
|
33
|
+
|
|
34
|
+
Primary tool surfaces include shell execution, file read/write/edit/patch operations, search/glob/listing, `benchmark_context`, `harness_components`, `research_sources`, `benchmark_repo_catalog`, `github_repo_digest`, `todo_write`, web fetch/search, MemPalace memory tools, and progressive-disclosure skills.
|
|
35
|
+
|
|
36
|
+
Source-grounded benchmark work uses `research_sources` for current arXiv/GitHub/Hugging Face/Kaggle discovery and `benchmark_repo_catalog` as an offline Terminal-Bench public-repo seed, then `github_repo_digest` on relevant public repositories before adopting repo-level implementation patterns. The digest evidence is treated as a bounded demonstration to compare against local manifests, commands, CI files, and component surfaces, not as authority over the current checkout.
|
|
37
|
+
|
|
38
|
+
Benchmark mode emphasizes:
|
|
39
|
+
|
|
40
|
+
- Current task instructions and verifier output over prior memory.
|
|
41
|
+
- Task-contract checklist creation before edits.
|
|
42
|
+
- Task-alignment checks for ignored constraints, distractor/decoy references, and off-task-looking edits.
|
|
43
|
+
- Reproduction before repair when feasible.
|
|
44
|
+
- Live preflight checks for PATH, package-manager, interpreter, virtualenv, and network/offline mismatches before treating a verifier as representative.
|
|
45
|
+
- Narrow-to-broad validation after edits.
|
|
46
|
+
- Reward-hack checks for verifier tampering, oracle/solution probes, result-file edits, shortcut completion markers, and bypass commands.
|
|
47
|
+
- CI workflow reconstruction from visible configuration.
|
|
48
|
+
- Anti-leakage handling for oracle, answer, gold, hidden, result, and solution files.
|
|
49
|
+
- Bounded replay of prior read/search/verifier checkpoints as hypotheses, not patch recipes.
|
|
50
|
+
|
|
51
|
+
## Memory
|
|
52
|
+
|
|
53
|
+
Cawdex includes MemPalace-backed project and global memory. In benchmark mode, `benchmark_context` can surface bounded relevant memories and prior local benchmark experience cards. These are explicitly framed as hypotheses that must be verified against current task files and verifier output. For Pi-Bench-style tasks, prior-run ranking uses complete proactivity ledgers as positive evidence and routes proactivity-ledger defects to warnings instead of replay hints.
|
|
54
|
+
|
|
55
|
+
Benchmark traces write compact `experienceCard` summaries with replay checkpoints, failure signatures, component-observability edit classification, task-contract state and signals, task-alignment, spec-compliance, reward-hack, HarnessAudit-style harness-safety, long-horizon roadmap/SaaS/mobile/WebDevBench/SWE-Cycle/SWE-CI coverage risk signals, and Pi-Bench proactivity ledger signals for context contract, hidden-intent hypothesis, clarification, privacy, and observable completion evidence. They also capture environment-reconstruction setup/failure evidence, dependency-upgrade setup-validation evidence, root-cause hypothesis state, failure-onset diagnosis state, trajectory-triage categories for interaction/execution/environment informativeness, and targeted-fix counts for failed-verifier repair edits, decision-observability edit predictions, validation-reliability evidence, context-utilization precision/miss evidence plus pre-edit context-bloat evidence, trajectory-cleanup evidence for base64/data-URI blobs, high-entropy output, duplicate output, and excessive truncation, evidence-grounding signals for stale/no-effect edit retries without a current-state refresh, AHE publish-state mutation signals for post-pass edits or state-changing commands without revalidation, run-efficiency action/usage/cost/time evidence, source-research coverage, verification commands, changed files, and warnings. They also emit a redacted ACC-style task/context/answer JSONL artifact and an AHE-style change-evaluation artifact for edit prediction verdicts, unpredicted edits, targeted-fix counts, and post-edit regression-cycle attribution.
|
|
56
|
+
|
|
57
|
+
## Models
|
|
58
|
+
|
|
59
|
+
Cawdex classifies edit targets by changed surface and records explicit regression foresight for AHE-style component and decision observability. Non-trivial benchmark edits should name the component surface and include both `Prediction:` and `At-risk regression:` lines; failed-verifier repair edits should also record a `Root cause:`, `Diagnosis:`, or failure-tied `Hypothesis:` plus a `Targeted fix:` or `Fix plan:` before patching. Repeated failing-verifier reruns and post-edit regression cycles are compressed into a failure-onset diagnosis so repair loops can be inspected before another patch. Missing forecasts, diagnoses, failure-onset diagnoses, and fix plans appear in change evaluation, experience-card state, trajectory quality, process defects, and completion reminders.
|
|
60
|
+
|
|
61
|
+
Cawdex is model-agnostic across OpenAI-compatible providers. Common configurations include OpenRouter, OpenAI, NVIDIA, Ollama, LM Studio, and DeepSeek-compatible endpoints. The model and provider are selected by CLI flags or environment configuration.
|
|
62
|
+
|
|
63
|
+
## Supported Environments
|
|
64
|
+
|
|
65
|
+
Packaged evaluation surfaces:
|
|
66
|
+
|
|
67
|
+
- Terminal-Bench adapter.
|
|
68
|
+
- KBench custom adapter.
|
|
69
|
+
- HAL custom agent.
|
|
70
|
+
- Exgentic/Open Agent Leaderboard custom agent.
|
|
71
|
+
|
|
72
|
+
Benchmark mode is designed for SWE-bench-style code repair, Terminal-Bench and TerminalWorld-style terminal tasks, context-reuse benchmarks, long-horizon RoadmapBench/SaaSBench/SWE-Bench Mobile/SWE-WebDevBench/SWE-Cycle/SWE-CI-style tasks, SWE-PRBench-style pull request review tasks, TML-Bench/Kaggle-style tabular ML tasks, Pi-Bench-style proactive personal assistant tasks, Open Agent AppWorld/BrowseComp+/tau2-style tasks, and generic multi-step tool-use tasks. The TerminalWorld profile treats `instruction.md` or task text as the outcome contract, marks `solve.sh` as oracle-only reference material, and emphasizes real CLI execution plus persistent `/app` artifact/service verification for tasks synthesized from in-the-wild terminal recordings. The Exgentic adapter builds a deterministic recommended action shortlist from the current task, context, latest observation, profile, schemas, and recent diagnostics before showing the full action schemas, highlights required argument keys with redacted exact current-state hints when available, repairs case/camelCase/schema-key near misses and exact latest-observation/context required-argument omissions before `ActionType` dispatch, avoids repeating no-effect actions when the latest observation did not change, uses the shortlist/hints to recover from malformed or missing action JSON with a viable non-finish action while completion is not ready, then folds prior observations/actions into a compact task-relevant ledger between steps so long noisy sessions keep current state, policy evidence, TerminalWorld artifact requirements and reference-solution avoidance, WebDevBench canary requirements, frontend/backend evidence, SWE-Cycle lifecycle phases, environment setup state, generated/selected tests, judge evidence, SWE-CI current/target commits, test gaps, inferred requirements, verifier deltas, SWE-PRBench PR metadata/diff/findings/evidence gaps, TML-Bench data contract/validation/submission evidence, Pi-Bench user/workspace/app context, hidden-intent hypotheses, clarification state, privacy risk, selected actions, and diagnostics in view without repeatedly reinjecting raw transcripts. The interactive CLI acknowledges submitted turns before provider I/O, treats F5/Shift+F5 raw terminal sequences as active-turn cancellation on Windows/xterm terminals, and auto-heals known-stuck OpenRouter preview models to the configured free fallback unless explicitly overridden.
|
|
73
|
+
|
|
74
|
+
## Evaluation Results
|
|
75
|
+
|
|
76
|
+
Cawdex writes `summary.json`, `trace.jsonl`, `worktree.patch`, `git-status.txt`, `open-agent-leaderboard-draft.json`, `agent-context-compiled.jsonl`, `change-evaluation.json`, and `submission-bundle-manifest.json` artifacts when benchmark trace output is enabled. The draft row follows the public Open Agent Leaderboard result-column shape where local trace evidence can support it, but remains `submissionReady:false` until an official harness supplies benchmark-owned scores and session success evidence. The submission bundle manifest indexes artifact paths and SHA-256 hashes, summarizes verifier/usage/process evidence, and lists missing official fields so local traces are not mistaken for leaderboard scores. Trajectory quality includes explicit task-alignment, spec-compliance, reward-hack, HarnessAudit-style harness-safety, long-horizon coverage, Pi-Bench proactivity, component-observability, root-cause hypothesis, failure-onset diagnosis, trajectory triage, targeted-fix manifest, context-utilization, candidate-file dossier, trajectory cleanup, pre-edit context-bloat, weak change-manifest, and evidence-grounding risk fields so benchmark reviewers can separate genuine task progress from distractor-following, repair edits without a failure-tied diagnosis or fix plan, undiagnosed verifier repair loops, recall-heavy unused context gathering without a compact localization dossier, noisy encoded or duplicate tool observations, visible-suite-only validation, stale/no-effect edit loops, verifier tampering, unsafe protected-resource access, external information transfer, destructive operations, oracle access, shortcut score markers, unsupported RoadmapBench/SaaSBench/SWE-Bench Mobile/SWE-WebDevBench/SWE-Cycle/SWE-CI/Pi-Bench completion claims, or edits/actions without falsifiable prediction and observable follow-through.
|
|
77
|
+
|
|
78
|
+
Prior experience reuse also reads AHE change-evaluation verdicts. Confirmed manifests can rank higher, while contradicted, regression-risk, pending-verification, missing-prediction, or missing-regression-forecast manifests are emitted as warning patterns instead of replay hints.
|
|
79
|
+
|
|
80
|
+
Prior experience reuse also applies ContextBench-style context discipline: concise runs whose inspected files were used by the eventual patch can rank higher, while low-utilization or pre-edit context-bloat runs are emitted as warning patterns instead of replay hints.
|
|
81
|
+
|
|
82
|
+
Prior experience reuse also applies AHE-style cleanup discipline: traces with encoded blobs, duplicate observations, or excessive truncation are emitted as warning patterns instead of replay hints.
|
|
83
|
+
|
|
84
|
+
Prior experience reuse also applies AHE-style diagnosis discipline: traces that repaired after a failed verifier without a root-cause hypothesis are emitted as warning patterns instead of replay hints.
|
|
85
|
+
|
|
86
|
+
Prior experience reuse also applies failure-onset discipline: traces with repeated verifier loops or regression cycles that were repaired without a diagnosis are emitted as warning patterns instead of replay hints.
|
|
87
|
+
|
|
88
|
+
Prior experience reuse also applies AHE-style fix-plan discipline: traces that repaired after a failed verifier without a targeted-fix manifest are emitted as warning patterns instead of replay hints.
|
|
89
|
+
|
|
90
|
+
Interactive type-ahead is preserved through active-turn cancellation and permission interruptions: text captured while the model/tool chain is running is restored into the next editable prompt instead of being emitted as a passive queued hint or silently submitted.
|
|
91
|
+
|
|
92
|
+
Official results should be produced through Exgentic, HAL, Terminal-Bench, KBench, or another benchmark-owned grader before submission.
|
|
93
|
+
|
|
94
|
+
## Limitations
|
|
95
|
+
|
|
96
|
+
- Prior memory and replay traces can be stale or mismatched; current task evidence must override them.
|
|
97
|
+
- Generic web or source research cannot prove task success without local or official verifier evidence.
|
|
98
|
+
- The agent card does not substitute for official harness scoring.
|
|
99
|
+
- Hosted or sandboxed benchmarks may restrict network, package install, or provider access; use pinned bundles or preinstalled `cawdex` where possible. The legacy `cawdex` alias remains supported for older images.
|
|
100
|
+
- Open-weight or free-tier models may show high variance on long-horizon tasks.
|
|
101
|
+
|
|
102
|
+
## How To Run
|
|
103
|
+
|
|
104
|
+
Print packaged adapters and card paths:
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
cawdex --print-exgentic-agent
|
|
108
|
+
cawdex --print-hal-agent
|
|
109
|
+
cawdex --print-kbench-adapter
|
|
110
|
+
cawdex --print-open-agent-card
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Run a benchmark task directly:
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
cawdex --provider openrouter --model openrouter/free --perm yolo --prompt "/benchmark swe-bench fix the issue"
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
For reproducible leaderboard runs, pin the installed package or bundle, pass provider credentials through environment variables, enable benchmark trace output, and submit only official harness results.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
"""cawdex Terminal-Bench adapter package."""
|
|
1
|
+
"""cawdex Terminal-Bench adapter package."""
|
|
@@ -1,174 +1,174 @@
|
|
|
1
|
-
"""Terminal-Bench adapter for Cawdex.
|
|
2
|
-
|
|
3
|
-
Usage:
|
|
4
|
-
tb run --agent-import-path resources.terminal_bench.cawdex_agent:CawdexTerminalBenchAgent ...
|
|
5
|
-
|
|
6
|
-
The adapter installs the npm package in the task container, then runs
|
|
7
|
-
Cawdex in non-interactive benchmark mode with the task instruction.
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
from __future__ import annotations
|
|
11
|
-
|
|
12
|
-
import os
|
|
13
|
-
import shlex
|
|
14
|
-
from pathlib import Path
|
|
15
|
-
|
|
16
|
-
from terminal_bench.agents.installed_agents.abstract_installed_agent import (
|
|
17
|
-
AbstractInstalledAgent,
|
|
18
|
-
)
|
|
19
|
-
|
|
20
|
-
try:
|
|
21
|
-
from terminal_bench.harness_models import TerminalCommand
|
|
22
|
-
except ImportError: # terminal-bench moved this in newer releases
|
|
23
|
-
from terminal_bench.terminal.models import TerminalCommand
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
class CawdexTerminalBenchAgent(AbstractInstalledAgent):
    """Installed-agent adapter that drives Cawdex inside a Terminal-Bench task."""

    @staticmethod
    def name() -> str:
        """Return the identifier of this agent."""
        return "cawdex"

    def __init__(
        self,
        model_name: str | None = None,
        provider: str | None = None,
        install_spec: str | None = None,
        max_turns: int | None = None,
        *args,
        **kwargs,
    ):
        """Remember the optional overrides; everything else goes to the base class."""
        super().__init__(*args, **kwargs)
        self._model_name, self._provider = model_name, provider
        self._install_spec, self._max_turns = install_spec, max_turns

    @property
    def _env(self) -> dict[str, str]:
        """Assemble the environment variables exported for the Cawdex run.

        Fixed defaults come first, then every non-empty pass-through variable
        found in the host environment, then the provider / model / max-turns
        values, where a constructor argument beats the host environment.
        """
        host = os.environ
        settings: dict[str, str] = {
            "CAWDEX_ENV_CONFIG": "1",
            "CAWDEX_HOME": "/tmp/cawdex-home",
            "CAWDEX_THEME": "minimal",
            "CAWDEX_SHOW_THINKING": "0",
            "CAWDEX_MEMORY": host.get("CAWDEX_MEMORY", "0"),
            "CAWDEX_BASH_TIMEOUT_MS": host.get("CAWDEX_BASH_TIMEOUT_MS", "300000"),
            "CAWDEX_INSTALL_SPEC": self._install_spec
            or host.get("CAWDEX_INSTALL_SPEC", "cawdex@latest"),
        }

        forwarded = (
            "CAWDEX_API_KEY",
            "CAWDEX_BASE_URL",
            "CAWDEX_MODEL",
            "CAWDEX_FALLBACK_MODEL",
            "CAWDEX_MAX_TOKENS",
            "CAWDEX_CONTEXT_WINDOW_TOKENS",
            "CAWDEX_COMPACTION_TRIGGER_TOKENS",
            "CAWDEX_COMPACTION_MODEL",
            "CAWDEX_COMPACTION_MAX_TOKENS",
            "CAWDEX_COMPACTION_USE_FALLBACK",
            "CAWDEX_LLM_COMPACTION",
            "CAWDEX_COMPACTION_MODE",
            "CAWDEX_LOCAL_COMPACTION_FALLBACK",
            "CAWDEX_TEMPERATURE",
            "CAWDEX_BUNDLE_ROOT",
            "CAWDEX_BUNDLE_TARBALL",
            "OPENROUTER_API_KEY",
            "OPENAI_API_KEY",
            "DEEPSEEK_API_KEY",
            "NVIDIA_API_KEY",
            "GOOGLE_API_KEY",
            "GEMINI_API_KEY",
            "GLM_API_KEY",
            "ZHIPUAI_API_KEY",
        )
        # Unset and empty variables are skipped rather than forwarded as "".
        settings.update(
            (variable, host[variable]) for variable in forwarded if host.get(variable)
        )

        chosen_provider = self._provider or host.get("CAWDEX_PROVIDER")
        if chosen_provider:
            settings["CAWDEX_PROVIDER"] = chosen_provider

        # An explicit model name replaces a CAWDEX_MODEL value forwarded above.
        if self._model_name:
            settings["CAWDEX_MODEL"] = self._model_name

        if self._max_turns:
            settings["CAWDEX_MAX_TURNS"] = str(self._max_turns)
        elif host.get("CAWDEX_MAX_TURNS"):
            settings["CAWDEX_MAX_TURNS"] = host["CAWDEX_MAX_TURNS"]

        return settings

    @property
    def _install_agent_script_path(self) -> Path:
        """Return the path of the ``setup.sh`` that sits beside this module."""
        return Path(__file__).parent.joinpath("setup.sh")

    def _run_agent_commands(self, task_description: str) -> list[TerminalCommand]:
        """Return the single blocking command that runs Cawdex and gathers artifacts.

        The generated bash script runs Cawdex on the task, records its exit
        status, copies the newest trace files into ``.cawdex/``, writes a
        redacted patch and ``git status`` when inside a work tree, announces
        each non-empty artifact, and finally exits with the recorded status.
        """
        prompt = "/benchmark terminal-bench " + task_description
        cawdex_call = " ".join(
            [
                "cawdex",
                "--prompt",
                shlex.quote(prompt),
                "--perm",
                "yolo",
                "--benchmark-trace-dir",
                ".cawdex/trace",
            ]
        )

        # (file inside the trace directory, file name under .cawdex/)
        trace_copies = (
            ("trace.jsonl", "benchmark-trace.jsonl"),
            ("agent-context-compiled.jsonl", "agent-context-compiled.jsonl"),
            ("change-evaluation.json", "change-evaluation.json"),
            ("submission-bundle-manifest.json", "submission-bundle-manifest.json"),
        )
        # (file name under .cawdex/, label printed when the file is non-empty)
        announcements = (
            ("benchmark.patch", "patch artifact"),
            ("benchmark-summary.json", "trace summary"),
            ("benchmark-trace.jsonl", "tool trace"),
            ("agent-context-compiled.jsonl", "context compilation"),
            ("change-evaluation.json", "change evaluation"),
            ("submission-bundle-manifest.json", "submission bundle"),
        )

        pieces = [
            f"{cawdex_call}; ",
            "status=$?; ",
            "mkdir -p .cawdex; ",
            # Shell function that masks token-shaped strings in its stdin.
            "redact_cawdex_artifact() { ",
            "sed -E ",
            "-e 's/sk-or-v1-[A-Za-z0-9_-]+/sk-or-v1-[REDACTED]/g' ",
            "-e 's/sk-[A-Za-z0-9_-]{16,}/sk-[REDACTED]/g' ",
            "-e 's/hf_[A-Za-z0-9]{16,}/hf_[REDACTED]/g' ",
            "-e 's/KGAT_[A-Za-z0-9]{16,}/KGAT_[REDACTED]/g' ",
            "-e 's/npm_[A-Za-z0-9]{16,}/npm_[REDACTED]/g'; ",
            "}; ",
            "summary=$(find .cawdex/trace -name summary.json -type f 2>/dev/null | sort | tail -n 1 || true); ",
            'if [ -n "$summary" ] && [ -f "$summary" ]; then ',
            'cp "$summary" .cawdex/benchmark-summary.json; ',
            'trace_dir=$(dirname "$summary"); ',
        ]
        for source_name, target_name in trace_copies:
            pieces.append(
                f'if [ -f "$trace_dir/{source_name}" ]; then '
                f'cp "$trace_dir/{source_name}" .cawdex/{target_name}; fi; '
            )
        pieces += [
            "fi; ",
            "if git rev-parse --is-inside-work-tree >/dev/null 2>&1; then ",
            "{ git diff --binary --no-ext-diff 2>/dev/null || true; ",
            "git diff --cached --binary --no-ext-diff 2>/dev/null || true; ",
            "git ls-files --others --exclude-standard -z 2>/dev/null | ",
            "while IFS= read -r -d '' f; do ",
            'git diff --no-index --binary --no-ext-diff -- /dev/null "$f" 2>/dev/null || true; ',
            "done; } | redact_cawdex_artifact > .cawdex/benchmark.patch; ",
            "git status --short 2>/dev/null | redact_cawdex_artifact > .cawdex/git-status.txt || true; ",
            "fi; ",
        ]
        for artifact, label in announcements:
            pieces.append(
                f"if [ -s .cawdex/{artifact} ]; then "
                f"echo '[cawdex] {label}: .cawdex/{artifact}'; "
                "fi; "
            )
        pieces.append('exit "$status"')

        shell_line = "bash -lc " + shlex.quote("".join(pieces))
        run_step = TerminalCommand(
            command=shell_line,
            max_timeout_sec=float("inf"),
            block=True,
        )
        return [run_step]
|
|
1
|
+
"""Terminal-Bench adapter for Cawdex.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
tb run --agent-import-path resources.terminal_bench.cawdex_agent:CawdexTerminalBenchAgent ...
|
|
5
|
+
|
|
6
|
+
The adapter installs the npm package in the task container, then runs
|
|
7
|
+
Cawdex in non-interactive benchmark mode with the task instruction.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
import shlex
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
from terminal_bench.agents.installed_agents.abstract_installed_agent import (
|
|
17
|
+
AbstractInstalledAgent,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
try:
|
|
21
|
+
from terminal_bench.harness_models import TerminalCommand
|
|
22
|
+
except ImportError: # terminal-bench moved this in newer releases
|
|
23
|
+
from terminal_bench.terminal.models import TerminalCommand
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class CawdexTerminalBenchAgent(AbstractInstalledAgent):
    """Terminal-Bench agent for Cawdex.

    The agent exports a curated environment (``_env``), points the harness at
    the ``setup.sh`` next to this module, and runs Cawdex once per task in
    non-interactive benchmark mode, collecting artifacts under ``.cawdex/``.
    """

    # Host variables forwarded verbatim when they are set to a non-empty value.
    _PASSTHROUGH_ENV_VARS: tuple[str, ...] = (
        "CAWDEX_API_KEY",
        "CAWDEX_BASE_URL",
        "CAWDEX_MODEL",
        "CAWDEX_FALLBACK_MODEL",
        "CAWDEX_MAX_TOKENS",
        "CAWDEX_CONTEXT_WINDOW_TOKENS",
        "CAWDEX_COMPACTION_TRIGGER_TOKENS",
        "CAWDEX_COMPACTION_MODEL",
        "CAWDEX_COMPACTION_MAX_TOKENS",
        "CAWDEX_COMPACTION_USE_FALLBACK",
        "CAWDEX_LLM_COMPACTION",
        "CAWDEX_COMPACTION_MODE",
        "CAWDEX_LOCAL_COMPACTION_FALLBACK",
        "CAWDEX_TEMPERATURE",
        "CAWDEX_BUNDLE_ROOT",
        "CAWDEX_BUNDLE_TARBALL",
        "OPENROUTER_API_KEY",
        "OPENAI_API_KEY",
        "DEEPSEEK_API_KEY",
        "NVIDIA_API_KEY",
        "GOOGLE_API_KEY",
        "GEMINI_API_KEY",
        "GLM_API_KEY",
        "ZHIPUAI_API_KEY",
    )

    @staticmethod
    def name() -> str:
        """Return the agent identifier, ``"cawdex"``."""
        return "cawdex"

    def __init__(
        self,
        model_name: str | None = None,
        provider: str | None = None,
        install_spec: str | None = None,
        max_turns: int | None = None,
        *args,
        **kwargs,
    ):
        """Store optional overrides and forward the rest to the base class.

        Args:
            model_name: Overrides ``CAWDEX_MODEL`` when truthy.
            provider: Overrides ``CAWDEX_PROVIDER`` when truthy.
            install_spec: Overrides ``CAWDEX_INSTALL_SPEC`` when truthy.
            max_turns: Overrides ``CAWDEX_MAX_TURNS`` when truthy.
            *args: Passed through to ``AbstractInstalledAgent``.
            **kwargs: Passed through to ``AbstractInstalledAgent``.
        """
        super().__init__(*args, **kwargs)
        self._model_name = model_name
        self._provider = provider
        self._install_spec = install_spec
        self._max_turns = max_turns

    @property
    def _env(self) -> dict[str, str]:
        """Build the environment variables exported for the Cawdex run.

        Returns:
            A mapping containing fixed defaults, every non-empty pass-through
            variable from the host environment, and the provider, model and
            max-turns settings. Constructor arguments take precedence over the
            host environment for those last three and for the install spec.
        """
        env: dict[str, str] = {
            "CAWDEX_ENV_CONFIG": "1",
            "CAWDEX_HOME": "/tmp/cawdex-home",
            "CAWDEX_THEME": "minimal",
            "CAWDEX_SHOW_THINKING": "0",
            "CAWDEX_MEMORY": os.environ.get("CAWDEX_MEMORY", "0"),
            "CAWDEX_BASH_TIMEOUT_MS": os.environ.get("CAWDEX_BASH_TIMEOUT_MS", "300000"),
            "CAWDEX_INSTALL_SPEC": self._install_spec
            or os.environ.get("CAWDEX_INSTALL_SPEC", "cawdex@latest"),
        }

        # Skip unset and empty variables so they are not exported as "".
        for key in self._PASSTHROUGH_ENV_VARS:
            value = os.environ.get(key)
            if value:
                env[key] = value

        if self._provider:
            env["CAWDEX_PROVIDER"] = self._provider
        elif os.environ.get("CAWDEX_PROVIDER"):
            env["CAWDEX_PROVIDER"] = os.environ["CAWDEX_PROVIDER"]

        # An explicit model name wins over a forwarded CAWDEX_MODEL.
        if self._model_name:
            env["CAWDEX_MODEL"] = self._model_name
        if self._max_turns:
            env["CAWDEX_MAX_TURNS"] = str(self._max_turns)
        elif os.environ.get("CAWDEX_MAX_TURNS"):
            env["CAWDEX_MAX_TURNS"] = os.environ["CAWDEX_MAX_TURNS"]

        return env

    @property
    def _install_agent_script_path(self) -> Path:
        """Return the path to ``setup.sh`` located beside this module."""
        return Path(__file__).parent / "setup.sh"

    def _run_agent_commands(self, task_description: str) -> list[TerminalCommand]:
        """Return the blocking command that runs Cawdex and gathers artifacts.

        Args:
            task_description: Task instruction; it is prefixed with
                ``/benchmark terminal-bench `` and shell-quoted.

        Returns:
            A one-element list holding a ``bash -lc`` command. The script runs
            Cawdex, remembers its exit status, copies the newest trace files
            into ``.cawdex/``, writes a redacted patch and ``git status`` when
            inside a git work tree, prints a line per non-empty artifact and
            exits with the remembered Cawdex status.
        """
        instruction = "/benchmark terminal-bench " + task_description
        agent_command = (
            "cawdex "
            f"--prompt {shlex.quote(instruction)} "
            "--perm yolo "
            "--benchmark-trace-dir .cawdex/trace"
        )
        script = (
            f"{agent_command}; "
            # Captured immediately so later commands cannot overwrite it.
            "status=$?; "
            "mkdir -p .cawdex; "
            # Masks token-shaped strings in whatever is piped through it.
            "redact_cawdex_artifact() { "
            "sed -E "
            "-e 's/sk-or-v1-[A-Za-z0-9_-]+/sk-or-v1-[REDACTED]/g' "
            "-e 's/sk-[A-Za-z0-9_-]{16,}/sk-[REDACTED]/g' "
            "-e 's/hf_[A-Za-z0-9]{16,}/hf_[REDACTED]/g' "
            "-e 's/KGAT_[A-Za-z0-9]{16,}/KGAT_[REDACTED]/g' "
            "-e 's/npm_[A-Za-z0-9]{16,}/npm_[REDACTED]/g'; "
            "}; "
            # The lexicographically last summary.json selects the trace directory.
            "summary=$(find .cawdex/trace -name summary.json -type f 2>/dev/null | sort | tail -n 1 || true); "
            "if [ -n \"$summary\" ] && [ -f \"$summary\" ]; then "
            "cp \"$summary\" .cawdex/benchmark-summary.json; "
            "trace_dir=$(dirname \"$summary\"); "
            "if [ -f \"$trace_dir/trace.jsonl\" ]; then cp \"$trace_dir/trace.jsonl\" .cawdex/benchmark-trace.jsonl; fi; "
            "if [ -f \"$trace_dir/agent-context-compiled.jsonl\" ]; then cp \"$trace_dir/agent-context-compiled.jsonl\" .cawdex/agent-context-compiled.jsonl; fi; "
            "if [ -f \"$trace_dir/change-evaluation.json\" ]; then cp \"$trace_dir/change-evaluation.json\" .cawdex/change-evaluation.json; fi; "
            "if [ -f \"$trace_dir/submission-bundle-manifest.json\" ]; then cp \"$trace_dir/submission-bundle-manifest.json\" .cawdex/submission-bundle-manifest.json; fi; "
            "fi; "
            "if git rev-parse --is-inside-work-tree >/dev/null 2>&1; then "
            "{ git diff --binary --no-ext-diff 2>/dev/null || true; "
            "git diff --cached --binary --no-ext-diff 2>/dev/null || true; "
            # .cawdex/ is excluded because the redirection below creates
            # benchmark.patch before this pipeline starts; without the
            # exclusion the patch would list itself and the trace files as
            # untracked changes and would never be empty.
            "git ls-files --others --exclude-standard --exclude=.cawdex/ -z 2>/dev/null | "
            "while IFS= read -r -d '' f; do "
            "git diff --no-index --binary --no-ext-diff -- /dev/null \"$f\" 2>/dev/null || true; "
            "done; } | redact_cawdex_artifact > .cawdex/benchmark.patch; "
            "git status --short 2>/dev/null | redact_cawdex_artifact > .cawdex/git-status.txt || true; "
            "fi; "
            "if [ -s .cawdex/benchmark.patch ]; then "
            "echo '[cawdex] patch artifact: .cawdex/benchmark.patch'; "
            "fi; "
            "if [ -s .cawdex/benchmark-summary.json ]; then "
            "echo '[cawdex] trace summary: .cawdex/benchmark-summary.json'; "
            "fi; "
            "if [ -s .cawdex/benchmark-trace.jsonl ]; then "
            "echo '[cawdex] tool trace: .cawdex/benchmark-trace.jsonl'; "
            "fi; "
            "if [ -s .cawdex/agent-context-compiled.jsonl ]; then "
            "echo '[cawdex] context compilation: .cawdex/agent-context-compiled.jsonl'; "
            "fi; "
            "if [ -s .cawdex/change-evaluation.json ]; then "
            "echo '[cawdex] change evaluation: .cawdex/change-evaluation.json'; "
            "fi; "
            "if [ -s .cawdex/submission-bundle-manifest.json ]; then "
            "echo '[cawdex] submission bundle: .cawdex/submission-bundle-manifest.json'; "
            "fi; "
            # Report the Cawdex result, not the result of artifact collection.
            "exit \"$status\""
        )
        command = "bash -lc " + shlex.quote(script)
        return [
            TerminalCommand(
                command=command,
                max_timeout_sec=float("inf"),
                block=True,
            )
        ]
|