cawdex 1.35.74 → 1.35.76

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. package/README.md +5 -5
  2. package/bin/anycode.js +2 -2
  3. package/bin/cawdex.js +408 -408
  4. package/bin/ecc-hooks.cjs +11 -11
  5. package/dist/agents-md.d.ts +31 -0
  6. package/dist/agents-md.js +340 -0
  7. package/dist/agents-md.js.map +1 -0
  8. package/dist/agents.js +1424 -1424
  9. package/dist/api.d.ts +1 -0
  10. package/dist/api.js +19 -14
  11. package/dist/api.js.map +1 -1
  12. package/dist/autonomous-loops.js +287 -287
  13. package/dist/benchmark-repos.d.ts +31 -0
  14. package/dist/benchmark-repos.js +234 -8
  15. package/dist/benchmark-repos.js.map +1 -1
  16. package/dist/command-palette.js +4 -2
  17. package/dist/command-palette.js.map +1 -1
  18. package/dist/compaction.js +8 -8
  19. package/dist/config.js +51 -36
  20. package/dist/config.js.map +1 -1
  21. package/dist/content-engine.js +543 -543
  22. package/dist/context-brief.d.ts +4 -0
  23. package/dist/context-brief.js +230 -0
  24. package/dist/context-brief.js.map +1 -0
  25. package/dist/cost-tracker.d.ts +33 -14
  26. package/dist/cost-tracker.js +81 -19
  27. package/dist/cost-tracker.js.map +1 -1
  28. package/dist/coverage.js +39 -39
  29. package/dist/docs-sync.js +98 -98
  30. package/dist/evaluation.js +452 -452
  31. package/dist/fixed-footer.d.ts +7 -1
  32. package/dist/fixed-footer.js +92 -18
  33. package/dist/fixed-footer.js.map +1 -1
  34. package/dist/git-workflow.js +49 -49
  35. package/dist/index.d.ts +2 -0
  36. package/dist/index.js +197 -65
  37. package/dist/index.js.map +1 -1
  38. package/dist/instant-artifact.d.ts +6 -0
  39. package/dist/instant-artifact.js +397 -0
  40. package/dist/instant-artifact.js.map +1 -0
  41. package/dist/live-queue.js +1 -1
  42. package/dist/live-queue.js.map +1 -1
  43. package/dist/model-aliases.d.ts +37 -0
  44. package/dist/model-aliases.js +203 -0
  45. package/dist/model-aliases.js.map +1 -0
  46. package/dist/orchestration.js +15 -15
  47. package/dist/permissions.d.ts +6 -0
  48. package/dist/permissions.js +53 -0
  49. package/dist/permissions.js.map +1 -1
  50. package/dist/pm2-manager.js +26 -26
  51. package/dist/query.d.ts +0 -1
  52. package/dist/query.js +74 -39
  53. package/dist/query.js.map +1 -1
  54. package/dist/refactor.js +87 -87
  55. package/dist/repo-command.js +7 -1
  56. package/dist/repo-command.js.map +1 -1
  57. package/dist/search-first.js +92 -92
  58. package/dist/skill-create.js +100 -100
  59. package/dist/stitch.js +1 -1
  60. package/dist/system-prompt.d.ts +2 -1
  61. package/dist/system-prompt.js +10 -5
  62. package/dist/system-prompt.js.map +1 -1
  63. package/dist/tools/github-repo-digest.d.ts +1 -1
  64. package/dist/tools/github-repo-digest.js +38 -6
  65. package/dist/tools/github-repo-digest.js.map +1 -1
  66. package/dist/types.d.ts +3 -0
  67. package/dist/types.js.map +1 -1
  68. package/dist/verification.js +55 -55
  69. package/package.json +1 -1
  70. package/resources/__init__.py +1 -1
  71. package/resources/exgentic/cawdex_agent/README.md +114 -114
  72. package/resources/exgentic/cawdex_agent/__init__.py +5 -5
  73. package/resources/exgentic/cawdex_agent/agent.py +605 -605
  74. package/resources/exgentic/cawdex_agent/requirements.txt +2 -2
  75. package/resources/exgentic/cawdex_agent/setup.sh +21 -21
  76. package/resources/exgentic/cawdex_agent/utils.py +1061 -1061
  77. package/resources/hal/cawdex_agent/README.md +24 -24
  78. package/resources/hal/cawdex_agent/__init__.py +1 -1
  79. package/resources/hal/cawdex_agent/main.py +550 -550
  80. package/resources/hal/cawdex_agent/requirements.txt +2 -2
  81. package/resources/kbench/cawdex_agent/README.md +107 -107
  82. package/resources/kbench/cawdex_agent/adapter.manifest.json +19 -19
  83. package/resources/kbench/cawdex_agent/runner.mjs +753 -753
  84. package/resources/open_agent_leaderboard/cawdex-agent-card.md +119 -119
  85. package/resources/terminal_bench/__init__.py +1 -1
  86. package/resources/terminal_bench/cawdex_agent.py +174 -174
  87. package/resources/terminal_bench/setup.sh +121 -121
@@ -1,2 +1,2 @@
1
- # Cawdex HAL adapter has no Python dependencies.
2
- # It shells out to the installed cawdex CLI.
1
+ # Cawdex HAL adapter has no Python dependencies.
2
+ # It shells out to the installed cawdex CLI.
@@ -1,107 +1,107 @@
1
- # Cawdex KBench Adapter
2
-
3
- This directory is a KBench `custom-adapter` for Cawdex.
4
-
5
- ```bash
6
- kbench run \
7
- --benchmark swe \
8
- --harness custom-adapter \
9
- --adapter /path/to/resources/kbench/cawdex_agent \
10
- --model-name openrouter/free \
11
- --instruction "Fix the bug"
12
- ```
13
-
14
- The runner reads the KBench JSON payload from `KBENCH_ADAPTER_INPUT` or stdin,
15
- invokes `cawdex --prompt "/benchmark ..."` in task mode, and emits one
16
- `AdapterRunnerOutput` JSON object to stdout.
17
- Known KBench slugs are mapped to benchmark profiles before dispatch:
18
- `swe`/`swe-bench`, `tb2`/`terminal-bench`, `terminalworld`/`terminal-world`,
19
- `swe-chain`,
20
- `swe-cycle`/`fullcycle`/`swe-judge`, `swe-ci`/`swecibench`, `swe-prbench`/`prbench`/`pr-review`, `tml-bench`/`tabular-ml`/`kaggle-ml`, `pi-bench`/`proactive-assistant`, `ci-repair`/`ci-repair-bench`, `roadmapbench`, `saasbench`,
21
- `swe-bench-mobile`, `webdevbench`/`swe-webdev-bench`, `appworld`, `browsecomp`/`browsecompplus`, and
22
- `tau2`/`tau-bench` use specialized prompts; unknown slugs use
23
- `generic`.
24
-
25
- The output includes redacted instruction/stdout/stderr artifact refs, native
26
- Cawdex trace refs, and redacted git patch/status refs when the task
27
- worktree is a git repo. If a native `summary.json` exists, compact verifier
28
- evidence, including parsed counts, compact failure signatures, and final-answer
29
- verification-claim plus incomplete/blocked completion evidence, usage/cost
30
- telemetry, cost-efficiency risk, invalid tool-action telemetry, task-contract checklist completion/no-edit/test-edit signals,
31
- task-alignment risk signals, spec-compliance risk signals, reward-hack risk signals, long-horizon coverage risk signals, Pi-Bench proactivity ledger signals, incomplete/inconclusive verifier markers,
32
- environment setup/reconstruction signals for missing dependencies, toolchains,
33
- or build artifacts, dependency manifest/lockfile setup-validation signals,
34
- HarnessAudit-style harness-safety signals for protected-resource access, external information transfer, destructive operations, and oracle access,
35
- candidate-file dossier signals for broad pre-edit inspection without a compact dossier,
36
- root-cause hypothesis signals for repair edits after failed verifiers without an explicit diagnosis,
37
- targeted-fix manifest signals for repair edits after failed verifiers without a fix plan,
38
- trajectory-cleanup signals for base64/data-URI blobs, high-entropy encoded output, duplicate output, and excessive truncation,
39
- skill-view fit/timing signals, per-target edit localization signals, large edit-surface
40
- signals, scratch/probe artifact signals, redundant tool-call signals,
41
- redundant failing-verifier rerun signals, blind-repair signals, post-edit regression-cycle signals,
42
- AHE publish-state mutation signals, latest post-edit verifier signals, post-edit and final-state
43
- diff-review signals, final-edit validation stability/lucky-pass signals, broad-validation signals,
44
- CI-derived validation signals,
45
- source-research recency signals,
46
- process-defect scoring, AHE-style change-evaluation verdicts, submission bundle manifest readiness/hash metadata,
47
- and trajectory-quality fields are copied to
48
- `benchmarkResult.traceSummary` for harness-side scoring. `benchmarkResult.usage`
49
- also aliases the native usage block for cost-aware leaderboards. Native verifier
50
- trace previews preserve both head and tail output so final test summaries survive
51
- noisy install/build logs. `benchmarkResult.experienceCard` includes bounded
52
- task-alignment/spec-compliance/reward-hack/harness-safety/long-horizon/proactivity risk blocks, component-observability edit classification for AHE-style surface attribution, including SWE-WebDevBench canary/frontend-backend/security validation signals, SWE-Cycle lifecycle/setup/test-generation/judge validation signals, SWE-CI evolution/checklist/CI-loop validation signals, and Pi-Bench context-contract/hidden-intent/clarification/privacy/completion evidence, root-cause hypothesis state and targeted-fix counts for failed-verifier repair edits, decision-observability predictions for edits and validation-reliability evidence
53
- for final verifier stability, broad validation, and CI-derived validation, plus
54
- context-utilization precision/miss evidence, candidate-dossier status, and
55
- trajectory-cleanup summaries for retrieval-aware scoring and avoiding noisy prior
56
- traces, plus run-efficiency action/usage/cost/time evidence for cost-aware scoring. Prior
57
- experience hints also expose compact source-research coverage, including
58
- hit/error counts, targeted/fresh coverage, recency windows, top URLs, and
59
- Kaggle fallback status. When present, `benchmarkResult.traceSummary` also
60
- includes the redacted ACC-style task/context/answer compilation from the native
61
- Cawdex trace for retrieval, replay, or training-data curation.
62
- It also includes `changeEvaluation` and `submissionBundleManifest` when present, so leaderboard
63
- submission tooling can inspect artifact hashes and missing official score/session
64
- fields without parsing the full summary.
65
-
66
- Inside benchmark mode, the read-only `benchmark_context` preflight also surfaces
67
- CI workflow run commands plus setup actions, env key names, service containers,
68
- job containers, and images from GitHub Actions, GitLab CI, CircleCI, Azure
69
- Pipelines, and Jenkins files. Env values are not printed. Agents can reconstruct
70
- the relevant CI environment and then reproduce project-native test/build/lint
71
- steps before finalizing.
72
- It also separates reusable prior local benchmark experience from similar failed
73
- or unsafe prior runs, so context reuse stays method-level and current verifier
74
- evidence remains authoritative. Pi-Bench-like tasks additionally prefer prior
75
- experience with complete context/hidden-intent/clarification/privacy/completion
76
- proactivity ledgers and surface incomplete ledgers as warnings. AHE change
77
- evaluations also participate in reuse: confirmed manifests can rank higher,
78
- while contradicted, regression-risk, pending-verification, missing-prediction,
79
- or missing-regression-forecast manifests are warnings rather than replay hints.
80
- Context-utilization evidence participates in reuse as well: concise runs whose
81
- inspected context was used by the eventual patch and whose pre-edit search was
82
- compressed into a candidate-file dossier can rank higher, while low-utilization,
83
- missing-dossier, or pre-edit context-bloat runs are warnings rather than replay
84
- hints.
85
- AHE-style cleanup evidence participates in reuse too: prior runs with encoded blobs,
86
- duplicate observations, or excessive truncation are surfaced as warnings instead of
87
- replay hints.
88
- AHE-style diagnosis evidence participates in reuse too: prior runs that repaired after
89
- failed verifiers without a root-cause hypothesis are surfaced as warnings instead of
90
- replay hints.
91
- AHE-style fix-plan evidence participates in reuse too: prior runs that repaired after
92
- failed verifiers without a targeted-fix manifest are surfaced as warnings instead of
93
- replay hints.
94
- Interactive type-ahead is preserved across active-turn cancellation and
95
- permission interruptions, so user drafts return to the prompt instead of being
96
- silently submitted while the harness is still running.
97
-
98
- Useful env vars:
99
-
100
- - `CAWDEX_KBENCH_COMMAND`: command used to launch Cawdex, default `cawdex`.
101
- - `CAWDEX_KBENCH_PERMISSION`: permission flag value, default `yolo`.
102
- - `CAWDEX_KBENCH_EXTRA_ARGS`: extra Cawdex CLI flags.
103
- - `CAWDEX_KBENCH_ARTIFACT_DIR`: directory for redacted instruction/stdout/stderr and trace files.
104
- - `CAWDEX_BASH_TIMEOUT_MS`: default Cawdex `bash` tool timeout; the adapter defaults to `300000` when unset.
105
-
106
- Provider keys should be passed via normal Cawdex env config or KBench's
107
- `--api-key-env`, which the runner forwards as Cawdex `--api-key-env`.
1
+ # Cawdex KBench Adapter
2
+
3
+ This directory is a KBench `custom-adapter` for Cawdex.
4
+
5
+ ```bash
6
+ kbench run \
7
+ --benchmark swe \
8
+ --harness custom-adapter \
9
+ --adapter /path/to/resources/kbench/cawdex_agent \
10
+ --model-name openrouter/free \
11
+ --instruction "Fix the bug"
12
+ ```
13
+
14
+ The runner reads the KBench JSON payload from `KBENCH_ADAPTER_INPUT` or stdin,
15
+ invokes `cawdex --prompt "/benchmark ..."` in task mode, and emits one
16
+ `AdapterRunnerOutput` JSON object to stdout.
17
+ Known KBench slugs are mapped to benchmark profiles before dispatch:
18
+ `swe`/`swe-bench`, `tb2`/`terminal-bench`, `terminalworld`/`terminal-world`,
19
+ `swe-chain`,
20
+ `swe-cycle`/`fullcycle`/`swe-judge`, `swe-ci`/`swecibench`, `swe-prbench`/`prbench`/`pr-review`, `tml-bench`/`tabular-ml`/`kaggle-ml`, `pi-bench`/`proactive-assistant`, `ci-repair`/`ci-repair-bench`, `roadmapbench`, `saasbench`,
21
+ `swe-bench-mobile`, `webdevbench`/`swe-webdev-bench`, `appworld`, `browsecomp`/`browsecompplus`, and
22
+ `tau2`/`tau-bench` use specialized prompts; unknown slugs use
23
+ `generic`.
24
+
25
+ The output includes redacted instruction/stdout/stderr artifact refs, native
26
+ Cawdex trace refs, and redacted git patch/status refs when the task
27
+ worktree is a git repo. If a native `summary.json` exists, compact verifier
28
+ evidence, including parsed counts, compact failure signatures, and final-answer
29
+ verification-claim plus incomplete/blocked completion evidence, usage/cost
30
+ telemetry, cost-efficiency risk, invalid tool-action telemetry, task-contract checklist completion/no-edit/test-edit signals,
31
+ task-alignment risk signals, spec-compliance risk signals, reward-hack risk signals, long-horizon coverage risk signals, Pi-Bench proactivity ledger signals, incomplete/inconclusive verifier markers,
32
+ environment setup/reconstruction signals for missing dependencies, toolchains,
33
+ or build artifacts, dependency manifest/lockfile setup-validation signals,
34
+ HarnessAudit-style harness-safety signals for protected-resource access, external information transfer, destructive operations, and oracle access,
35
+ candidate-file dossier signals for broad pre-edit inspection without a compact dossier,
36
+ root-cause hypothesis signals for repair edits after failed verifiers without an explicit diagnosis,
37
+ targeted-fix manifest signals for repair edits after failed verifiers without a fix plan,
38
+ trajectory-cleanup signals for base64/data-URI blobs, high-entropy encoded output, duplicate output, and excessive truncation,
39
+ skill-view fit/timing signals, per-target edit localization signals, large edit-surface
40
+ signals, scratch/probe artifact signals, redundant tool-call signals,
41
+ redundant failing-verifier rerun signals, blind-repair signals, post-edit regression-cycle signals,
42
+ AHE publish-state mutation signals, latest post-edit verifier signals, post-edit and final-state
43
+ diff-review signals, final-edit validation stability/lucky-pass signals, broad-validation signals,
44
+ CI-derived validation signals,
45
+ source-research recency signals,
46
+ process-defect scoring, AHE-style change-evaluation verdicts, submission bundle manifest readiness/hash metadata,
47
+ and trajectory-quality fields are copied to
48
+ `benchmarkResult.traceSummary` for harness-side scoring. `benchmarkResult.usage`
49
+ also aliases the native usage block for cost-aware leaderboards. Native verifier
50
+ trace previews preserve both head and tail output so final test summaries survive
51
+ noisy install/build logs. `benchmarkResult.experienceCard` includes bounded
52
+ task-alignment/spec-compliance/reward-hack/harness-safety/long-horizon/proactivity risk blocks, component-observability edit classification for AHE-style surface attribution, including SWE-WebDevBench canary/frontend-backend/security validation signals, SWE-Cycle lifecycle/setup/test-generation/judge validation signals, SWE-CI evolution/checklist/CI-loop validation signals, and Pi-Bench context-contract/hidden-intent/clarification/privacy/completion evidence, root-cause hypothesis state and targeted-fix counts for failed-verifier repair edits, decision-observability predictions for edits and validation-reliability evidence
53
+ for final verifier stability, broad validation, and CI-derived validation, plus
54
+ context-utilization precision/miss evidence, candidate-dossier status, and
55
+ trajectory-cleanup summaries for retrieval-aware scoring and avoiding noisy prior
56
+ traces, plus run-efficiency action/usage/cost/time evidence for cost-aware scoring. Prior
57
+ experience hints also expose compact source-research coverage, including
58
+ hit/error counts, targeted/fresh coverage, recency windows, top URLs, and
59
+ Kaggle fallback status. When present, `benchmarkResult.traceSummary` also
60
+ includes the redacted ACC-style task/context/answer compilation from the native
61
+ Cawdex trace for retrieval, replay, or training-data curation.
62
+ It also includes `changeEvaluation` and `submissionBundleManifest` when present, so leaderboard
63
+ submission tooling can inspect artifact hashes and missing official score/session
64
+ fields without parsing the full summary.
65
+
66
+ Inside benchmark mode, the read-only `benchmark_context` preflight also surfaces
67
+ CI workflow run commands plus setup actions, env key names, service containers,
68
+ job containers, and images from GitHub Actions, GitLab CI, CircleCI, Azure
69
+ Pipelines, and Jenkins files. Env values are not printed. Agents can reconstruct
70
+ the relevant CI environment and then reproduce project-native test/build/lint
71
+ steps before finalizing.
72
+ It also separates reusable prior local benchmark experience from similar failed
73
+ or unsafe prior runs, so context reuse stays method-level and current verifier
74
+ evidence remains authoritative. Pi-Bench-like tasks additionally prefer prior
75
+ experience with complete context/hidden-intent/clarification/privacy/completion
76
+ proactivity ledgers and surface incomplete ledgers as warnings. AHE change
77
+ evaluations also participate in reuse: confirmed manifests can rank higher,
78
+ while contradicted, regression-risk, pending-verification, missing-prediction,
79
+ or missing-regression-forecast manifests are warnings rather than replay hints.
80
+ Context-utilization evidence participates in reuse as well: concise runs whose
81
+ inspected context was used by the eventual patch and whose pre-edit search was
82
+ compressed into a candidate-file dossier can rank higher, while low-utilization,
83
+ missing-dossier, or pre-edit context-bloat runs are warnings rather than replay
84
+ hints.
85
+ AHE-style cleanup evidence participates in reuse too: prior runs with encoded blobs,
86
+ duplicate observations, or excessive truncation are surfaced as warnings instead of
87
+ replay hints.
88
+ AHE-style diagnosis evidence participates in reuse too: prior runs that repaired after
89
+ failed verifiers without a root-cause hypothesis are surfaced as warnings instead of
90
+ replay hints.
91
+ AHE-style fix-plan evidence participates in reuse too: prior runs that repaired after
92
+ failed verifiers without a targeted-fix manifest are surfaced as warnings instead of
93
+ replay hints.
94
+ Interactive type-ahead is preserved across active-turn cancellation and
95
+ permission interruptions, so user drafts return to the prompt instead of being
96
+ silently submitted while the harness is still running.
97
+
98
+ Useful env vars:
99
+
100
+ - `CAWDEX_KBENCH_COMMAND`: command used to launch Cawdex, default `cawdex`.
101
+ - `CAWDEX_KBENCH_PERMISSION`: permission flag value, default `yolo`.
102
+ - `CAWDEX_KBENCH_EXTRA_ARGS`: extra Cawdex CLI flags.
103
+ - `CAWDEX_KBENCH_ARTIFACT_DIR`: directory for redacted instruction/stdout/stderr and trace files.
104
+ - `CAWDEX_BASH_TIMEOUT_MS`: default Cawdex `bash` tool timeout; the adapter defaults to `300000` when unset.
105
+
106
+ Provider keys should be passed via normal Cawdex env config or KBench's
107
+ `--api-key-env`, which the runner forwards as Cawdex `--api-key-env`.
@@ -1,19 +1,19 @@
1
- {
2
- "schemaVersion": "kbench.adapter/v1",
3
- "id": "cawdex",
4
- "kind": "node",
5
- "entry": "./runner.mjs",
6
- "version": "0.1.0",
7
- "supportedBenchmarks": ["swe", "tb2", "sae"],
8
- "capabilities": {
9
- "runModes": ["task"],
10
- "machineReadableStdout": true,
11
- "supportsPatchOutput": false,
12
- "supportsTrajectory": true,
13
- "supportsToolCallTrace": true,
14
- "supportsResume": false,
15
- "supportsImages": false,
16
- "supportsSandboxBridge": false,
17
- "supportsPromptTemplate": false
18
- }
19
- }
1
+ {
2
+ "schemaVersion": "kbench.adapter/v1",
3
+ "id": "cawdex",
4
+ "kind": "node",
5
+ "entry": "./runner.mjs",
6
+ "version": "0.1.0",
7
+ "supportedBenchmarks": ["swe", "tb2", "sae"],
8
+ "capabilities": {
9
+ "runModes": ["task"],
10
+ "machineReadableStdout": true,
11
+ "supportsPatchOutput": false,
12
+ "supportsTrajectory": true,
13
+ "supportsToolCallTrace": true,
14
+ "supportsResume": false,
15
+ "supportsImages": false,
16
+ "supportsSandboxBridge": false,
17
+ "supportsPromptTemplate": false
18
+ }
19
+ }