cawdex 1.35.74 → 1.35.76

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. package/README.md +5 -5
  2. package/bin/anycode.js +2 -2
  3. package/bin/cawdex.js +408 -408
  4. package/bin/ecc-hooks.cjs +11 -11
  5. package/dist/agents-md.d.ts +31 -0
  6. package/dist/agents-md.js +340 -0
  7. package/dist/agents-md.js.map +1 -0
  8. package/dist/agents.js +1424 -1424
  9. package/dist/api.d.ts +1 -0
  10. package/dist/api.js +19 -14
  11. package/dist/api.js.map +1 -1
  12. package/dist/autonomous-loops.js +287 -287
  13. package/dist/benchmark-repos.d.ts +31 -0
  14. package/dist/benchmark-repos.js +234 -8
  15. package/dist/benchmark-repos.js.map +1 -1
  16. package/dist/command-palette.js +4 -2
  17. package/dist/command-palette.js.map +1 -1
  18. package/dist/compaction.js +8 -8
  19. package/dist/config.js +51 -36
  20. package/dist/config.js.map +1 -1
  21. package/dist/content-engine.js +543 -543
  22. package/dist/context-brief.d.ts +4 -0
  23. package/dist/context-brief.js +230 -0
  24. package/dist/context-brief.js.map +1 -0
  25. package/dist/cost-tracker.d.ts +33 -14
  26. package/dist/cost-tracker.js +81 -19
  27. package/dist/cost-tracker.js.map +1 -1
  28. package/dist/coverage.js +39 -39
  29. package/dist/docs-sync.js +98 -98
  30. package/dist/evaluation.js +452 -452
  31. package/dist/fixed-footer.d.ts +7 -1
  32. package/dist/fixed-footer.js +92 -18
  33. package/dist/fixed-footer.js.map +1 -1
  34. package/dist/git-workflow.js +49 -49
  35. package/dist/index.d.ts +2 -0
  36. package/dist/index.js +197 -65
  37. package/dist/index.js.map +1 -1
  38. package/dist/instant-artifact.d.ts +6 -0
  39. package/dist/instant-artifact.js +397 -0
  40. package/dist/instant-artifact.js.map +1 -0
  41. package/dist/live-queue.js +1 -1
  42. package/dist/live-queue.js.map +1 -1
  43. package/dist/model-aliases.d.ts +37 -0
  44. package/dist/model-aliases.js +203 -0
  45. package/dist/model-aliases.js.map +1 -0
  46. package/dist/orchestration.js +15 -15
  47. package/dist/permissions.d.ts +6 -0
  48. package/dist/permissions.js +53 -0
  49. package/dist/permissions.js.map +1 -1
  50. package/dist/pm2-manager.js +26 -26
  51. package/dist/query.d.ts +0 -1
  52. package/dist/query.js +74 -39
  53. package/dist/query.js.map +1 -1
  54. package/dist/refactor.js +87 -87
  55. package/dist/repo-command.js +7 -1
  56. package/dist/repo-command.js.map +1 -1
  57. package/dist/search-first.js +92 -92
  58. package/dist/skill-create.js +100 -100
  59. package/dist/stitch.js +1 -1
  60. package/dist/system-prompt.d.ts +2 -1
  61. package/dist/system-prompt.js +10 -5
  62. package/dist/system-prompt.js.map +1 -1
  63. package/dist/tools/github-repo-digest.d.ts +1 -1
  64. package/dist/tools/github-repo-digest.js +38 -6
  65. package/dist/tools/github-repo-digest.js.map +1 -1
  66. package/dist/types.d.ts +3 -0
  67. package/dist/types.js.map +1 -1
  68. package/dist/verification.js +55 -55
  69. package/package.json +1 -1
  70. package/resources/__init__.py +1 -1
  71. package/resources/exgentic/cawdex_agent/README.md +114 -114
  72. package/resources/exgentic/cawdex_agent/__init__.py +5 -5
  73. package/resources/exgentic/cawdex_agent/agent.py +605 -605
  74. package/resources/exgentic/cawdex_agent/requirements.txt +2 -2
  75. package/resources/exgentic/cawdex_agent/setup.sh +21 -21
  76. package/resources/exgentic/cawdex_agent/utils.py +1061 -1061
  77. package/resources/hal/cawdex_agent/README.md +24 -24
  78. package/resources/hal/cawdex_agent/__init__.py +1 -1
  79. package/resources/hal/cawdex_agent/main.py +550 -550
  80. package/resources/hal/cawdex_agent/requirements.txt +2 -2
  81. package/resources/kbench/cawdex_agent/README.md +107 -107
  82. package/resources/kbench/cawdex_agent/adapter.manifest.json +19 -19
  83. package/resources/kbench/cawdex_agent/runner.mjs +753 -753
  84. package/resources/open_agent_leaderboard/cawdex-agent-card.md +119 -119
  85. package/resources/terminal_bench/__init__.py +1 -1
  86. package/resources/terminal_bench/cawdex_agent.py +174 -174
  87. package/resources/terminal_bench/setup.sh +121 -121
@@ -1,753 +1,753 @@
1
- #!/usr/bin/env node
2
- import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync } from 'node:fs';
3
- import { tmpdir } from 'node:os';
4
- import { dirname, join } from 'node:path';
5
- import { spawnSync } from 'node:child_process';
6
-
7
- const startedAt = Date.now();
8
-
9
- function emit(output) {
10
- process.stdout.write(JSON.stringify(output));
11
- }
12
-
13
- function fail(status, message, extra = {}) {
14
- emit({
15
- ok: false,
16
- status,
17
- failureKind: status,
18
- finalText: message,
19
- elapsedMs: Date.now() - startedAt,
20
- artifacts: [],
21
- error: { message },
22
- ...extra,
23
- });
24
- }
25
-
26
- function readInput() {
27
- const inputPath = process.env.KBENCH_ADAPTER_INPUT;
28
- const raw = inputPath ? readFileSync(inputPath, 'utf8') : readFileSync(0, 'utf8');
29
- return JSON.parse(raw);
30
- }
31
-
32
- function redact(text) {
33
- return String(text || '')
34
- .replace(/sk-or-v1-[A-Za-z0-9_-]+/g, 'sk-or-v1-[REDACTED]')
35
- .replace(/sk-[A-Za-z0-9_-]{16,}/g, 'sk-[REDACTED]')
36
- .replace(/hf_[A-Za-z0-9]{16,}/g, 'hf_[REDACTED]')
37
- .replace(/KGAT_[A-Za-z0-9]{16,}/g, 'KGAT_[REDACTED]')
38
- .replace(/npm_[A-Za-z0-9]{16,}/g, 'npm_[REDACTED]');
39
- }
40
-
41
- function truncate(text, max) {
42
- const safe = redact(text);
43
- if (safe.length <= max) return safe;
44
- return `${safe.slice(0, max - 80)}\n...[truncated ${safe.length - (max - 80)} chars]`;
45
- }
46
-
47
- function splitCommand(command) {
48
- const parts = [];
49
- let cur = '';
50
- let quote = null;
51
- let escaped = false;
52
- for (const ch of command.trim()) {
53
- if (escaped) {
54
- cur += ch;
55
- escaped = false;
56
- continue;
57
- }
58
- if (ch === '\\' && quote) {
59
- escaped = true;
60
- continue;
61
- }
62
- if (quote) {
63
- if (ch === quote) quote = null;
64
- else cur += ch;
65
- continue;
66
- }
67
- if (ch === '"' || ch === "'") {
68
- quote = ch;
69
- continue;
70
- }
71
- if (/\s/.test(ch)) {
72
- if (cur) {
73
- parts.push(cur);
74
- cur = '';
75
- }
76
- continue;
77
- }
78
- cur += ch;
79
- }
80
- if (cur) parts.push(cur);
81
- return parts;
82
- }
83
-
84
- function profileForBenchmark(benchmark) {
85
- const slug = String(benchmark || '').toLowerCase().replace(/[^a-z0-9]+/g, '');
86
- if (slug === 'swe' || slug === 'swebench') return 'swe-bench';
87
- if (slug === 'tb2' || slug === 'terminalbench') return 'terminal-bench';
88
- if (slug === 'terminalworld' || slug === 'terminalworldbench' || slug === 'tw' || slug === 'tworld') return 'terminalworld';
89
- if (slug === 'swechain' || slug === 'chain' || slug === 'upgrade') return 'swe-chain';
90
- if (slug === 'swecycle' || slug === 'swecyclebench' || slug === 'fullcycle' || slug === 'swejudge') return 'swe-cycle';
91
- if (slug === 'sweci' || slug === 'swecibench') return 'swe-ci';
92
- if (slug === 'swepr' || slug === 'sweprbench' || slug === 'prbench' || slug === 'prreview' || slug === 'pullrequestreview' || slug === 'codereviewbench') return 'swe-prbench';
93
- if (slug === 'tml' || slug === 'tmlbench' || slug === 'tabularml' || slug === 'kaggleml' || slug === 'kagglebench' || slug === 'datascience') return 'tml-bench';
94
- if (slug === 'pi' || slug === 'pibench' || slug === 'proactive' || slug === 'proactiveassistant' || slug === 'personalassistant' || slug === 'hiddenintent') return 'pi-bench';
95
- if (slug === 'cirepair' || slug === 'cirepairbench' || slug === 'ci') return 'ci-repair';
96
- if (slug === 'wildclaw' || slug === 'wildclawbench' || slug === 'wcbench') return 'wildclaw';
97
- if (slug === 'arc' || slug === 'arcagi' || slug === 'arcagi3' || slug === 'arcprize') return 'arc-agi';
98
- if (slug === 'spec' || slug === 'specbench' || slug === 'speccompliance') return 'specbench';
99
- if (slug === 'rhb' || slug === 'rewardhack' || slug === 'rewardhacking' || slug === 'rewardhackingagents') return 'reward-hacking';
100
- if (slug === 'roadmap' || slug === 'roadmapbench' || slug === 'longhorizon' || slug === 'versionupgrade') return 'roadmapbench';
101
- if (slug === 'saas' || slug === 'saasbench' || slug === 'enterprise') return 'saasbench';
102
- if (slug === 'mobile' || slug === 'swebenchmobile' || slug === 'swemobile' || slug === 'ios') return 'swe-bench-mobile';
103
- if (slug === 'webdev' || slug === 'webdevbench' || slug === 'swewebdev' || slug === 'swewebdevbench' || slug === 'vibecoding') return 'webdevbench';
104
- if (slug === 'app' || slug === 'appworld' || slug === 'appworldbench') return 'appworld';
105
- if (slug === 'browsecomp' || slug === 'browsecompplus' || slug === 'deepresearch' || slug === 'webresearch') return 'browsecomp';
106
- if (slug === 'tau' || slug === 'tau2' || slug === 'taubench' || slug === 'taubench2' || slug.startsWith('tau2') || slug.startsWith('taubench')) return 'tau2';
107
- return 'generic';
108
- }
109
-
110
- function collectTraceRefs(traceDir) {
111
- const refs = [];
112
- if (!traceDir || !existsSync(traceDir)) return refs;
113
- const stack = [traceDir];
114
- while (stack.length) {
115
- const dir = stack.pop();
116
- for (const entry of readdirSync(dir, { withFileTypes: true })) {
117
- const full = join(dir, entry.name);
118
- if (entry.isDirectory()) {
119
- stack.push(full);
120
- } else if (entry.name === 'summary.json' || entry.name === 'trace.jsonl') {
121
- refs.push({
122
- kind: entry.name === 'trace.jsonl' ? 'cawdex-tool-trace' : 'cawdex-summary',
123
- path: full,
124
- contentType: entry.name.endsWith('.jsonl') ? 'application/jsonl' : 'application/json',
125
- description: `Cawdex ${entry.name}`,
126
- });
127
- }
128
- }
129
- }
130
- return refs;
131
- }
132
-
133
- function readLatestTraceSummary(traceDir) {
134
- if (!traceDir || !existsSync(traceDir)) return null;
135
- const summaries = [];
136
- const stack = [traceDir];
137
- while (stack.length) {
138
- const dir = stack.pop();
139
- for (const entry of readdirSync(dir, { withFileTypes: true })) {
140
- const full = join(dir, entry.name);
141
- if (entry.isDirectory()) {
142
- stack.push(full);
143
- } else if (entry.name === 'summary.json') {
144
- try {
145
- const summary = JSON.parse(readFileSync(full, 'utf8'));
146
- const endedAtMs = Number.isFinite(Date.parse(summary.endedAt)) ? Date.parse(summary.endedAt) : 0;
147
- const mtimeMs = statSync(full).mtimeMs;
148
- summaries.push({ path: full, summary, sortKey: endedAtMs || mtimeMs });
149
- } catch {
150
- // Ignore malformed or partially-written trace summaries.
151
- }
152
- }
153
- }
154
- }
155
- summaries.sort((a, b) => b.sortKey - a.sortKey);
156
- return summaries[0] || null;
157
- }
158
-
159
- function compactTraceSummary(traceSummary) {
160
- if (!traceSummary) return undefined;
161
- const summary = traceSummary.summary || {};
162
- const quality = summary.trajectoryQuality || {};
163
- const experienceCard = compactExperienceCard(summary.experienceCard);
164
- return {
165
- path: traceSummary.path,
166
- verificationCount: summary.verificationCount,
167
- verificationCommands: Array.isArray(summary.verificationCommands) ? summary.verificationCommands.slice(0, 20) : [],
168
- verificationEvidence: summary.verificationEvidence,
169
- finalAnswerEvidence: summary.finalAnswerEvidence,
170
- usage: summary.usage,
171
- experienceCard,
172
- agentContextCompilation: compactAgentContextCompilation(summary.agentContextCompilation),
173
- changeEvaluation: compactChangeEvaluation(summary.changeEvaluation),
174
- submissionBundleManifest: compactSubmissionBundleManifest(summary.submissionBundleManifest),
175
- changedFiles: Array.isArray(summary.changedFiles) ? summary.changedFiles.slice(0, 100) : [],
176
- worktreeChangedFiles: Array.isArray(summary.worktreeChangedFiles) ? summary.worktreeChangedFiles.slice(0, 100) : [],
177
- artifacts: Array.isArray(summary.artifacts) ? summary.artifacts.slice(0, 20) : [],
178
- trajectoryQuality: {
179
- benchmarkContextUsed: quality.benchmarkContextUsed,
180
- usageCallCount: quality.usageCallCount,
181
- usageTotalTokens: quality.usageTotalTokens,
182
- usageEstimatedCostUsd: quality.usageEstimatedCostUsd,
183
- costEfficiencyRisk: quality.costEfficiencyRisk,
184
- totalToolElapsedMs: quality.totalToolElapsedMs,
185
- maxToolElapsedMs: quality.maxToolElapsedMs,
186
- slowToolCallCount: quality.slowToolCallCount,
187
- slowToolEvents: Array.isArray(quality.slowToolEvents) ? quality.slowToolEvents.slice(0, 20) : [],
188
- timeEfficiencyRisk: quality.timeEfficiencyRisk,
189
- invalidToolActionCount: quality.invalidToolActionCount,
190
- invalidToolActionPercent: quality.invalidToolActionPercent,
191
- invalidToolActionEvents: Array.isArray(quality.invalidToolActionEvents) ? quality.invalidToolActionEvents.slice(0, 20) : [],
192
- localizationBeforeFirstEdit: quality.localizationBeforeFirstEdit,
193
- failingReproductionBeforeFirstEdit: quality.failingReproductionBeforeFirstEdit,
194
- passingValidationAfterFirstEdit: quality.passingValidationAfterFirstEdit,
195
- validationAfterLastEdit: quality.validationAfterLastEdit,
196
- passingValidationAfterLastEdit: quality.passingValidationAfterLastEdit,
197
- finalEditVerificationCount: quality.finalEditVerificationCount,
198
- finalEditPassingVerificationCount: quality.finalEditPassingVerificationCount,
199
- stableValidationAfterLastEdit: quality.stableValidationAfterLastEdit,
200
- broadValidationAfterLastEdit: quality.broadValidationAfterLastEdit,
201
- passingBroadValidationAfterLastEdit: quality.passingBroadValidationAfterLastEdit,
202
- successfulVerificationCount: quality.successfulVerificationCount,
203
- failedVerificationCount: quality.failedVerificationCount,
204
- incompleteVerifierCount: quality.incompleteVerifierCount,
205
- incompleteVerifierEvents: Array.isArray(quality.incompleteVerifierEvents) ? quality.incompleteVerifierEvents.slice(0, 20) : [],
206
- inconclusiveVerifierEvents: Array.isArray(quality.inconclusiveVerifierEvents) ? quality.inconclusiveVerifierEvents.slice(0, 20) : [],
207
- environmentSetupFailureCount: quality.environmentSetupFailureCount,
208
- environmentSetupFailureEvents: Array.isArray(quality.environmentSetupFailureEvents) ? quality.environmentSetupFailureEvents.slice(0, 20) : [],
209
- unresolvedEnvironmentSetupFailureCount: quality.unresolvedEnvironmentSetupFailureCount,
210
- unresolvedEnvironmentSetupFailureEvents: Array.isArray(quality.unresolvedEnvironmentSetupFailureEvents) ? quality.unresolvedEnvironmentSetupFailureEvents.slice(0, 20) : [],
211
- environmentSetupCount: quality.environmentSetupCount,
212
- successfulEnvironmentSetupCount: quality.successfulEnvironmentSetupCount,
213
- environmentSetupEvents: Array.isArray(quality.environmentSetupEvents) ? quality.environmentSetupEvents.slice(0, 20) : [],
214
- dependencyManifestEditCount: quality.dependencyManifestEditCount,
215
- dependencyLockfileEditCount: quality.dependencyLockfileEditCount,
216
- dependencyManifestEditEvents: Array.isArray(quality.dependencyManifestEditEvents) ? quality.dependencyManifestEditEvents.slice(0, 20) : [],
217
- dependencyLockfileEditEvents: Array.isArray(quality.dependencyLockfileEditEvents) ? quality.dependencyLockfileEditEvents.slice(0, 20) : [],
218
- dependencySetupAfterManifestEdit: quality.dependencySetupAfterManifestEdit,
219
- passingDependencySetupAfterManifestEdit: quality.passingDependencySetupAfterManifestEdit,
220
- dependencyValidationAfterManifestEdit: quality.dependencyValidationAfterManifestEdit,
221
- passingDependencyValidationAfterManifestEdit: quality.passingDependencyValidationAfterManifestEdit,
222
- firstDependencySetupAfterManifestEditSeq: quality.firstDependencySetupAfterManifestEditSeq,
223
- firstDependencyValidationAfterManifestEditSeq: quality.firstDependencyValidationAfterManifestEditSeq,
224
- skillViewCount: quality.skillViewCount,
225
- skillViewEvents: Array.isArray(quality.skillViewEvents) ? quality.skillViewEvents.slice(0, 20) : [],
226
- skillNames: Array.isArray(quality.skillNames) ? quality.skillNames.slice(0, 20) : [],
227
- skillLoadedBeforeLocalContext: quality.skillLoadedBeforeLocalContext,
228
- excessiveSkillViewCount: quality.excessiveSkillViewCount,
229
- ciWorkflowCommandCount: quality.ciWorkflowCommandCount,
230
- ciVerifierCommands: Array.isArray(quality.ciVerifierCommands) ? quality.ciVerifierCommands.slice(0, 20) : [],
231
- ciValidationAfterFirstEdit: quality.ciValidationAfterFirstEdit,
232
- passingCiValidationAfterFirstEdit: quality.passingCiValidationAfterFirstEdit,
233
- ciValidationAfterLastEdit: quality.ciValidationAfterLastEdit,
234
- passingCiValidationAfterLastEdit: quality.passingCiValidationAfterLastEdit,
235
- firstCiValidationAfterFirstEditSeq: quality.firstCiValidationAfterFirstEditSeq,
236
- sourceResearchCoverage: quality.sourceResearchCoverage,
237
- taskContractSignalCount: quality.taskContractSignalCount,
238
- taskContractChecklistUsed: quality.taskContractChecklistUsed,
239
- taskContractChecklistAfterContext: quality.taskContractChecklistAfterContext,
240
- taskContractChecklistComplete: quality.taskContractChecklistComplete,
241
- latestTodoSeq: quality.latestTodoSeq,
242
- todoIncompleteCount: quality.todoIncompleteCount,
243
- todoIncompleteItems: Array.isArray(quality.todoIncompleteItems) ? quality.todoIncompleteItems.slice(0, 20) : [],
244
- taskAlignmentRisk: quality.taskAlignmentRisk,
245
- taskAlignmentSignalCount: quality.taskAlignmentSignalCount,
246
- taskAlignmentSignals: Array.isArray(quality.taskAlignmentSignals) ? quality.taskAlignmentSignals.slice(0, 20) : [],
247
- specComplianceRisk: quality.specComplianceRisk,
248
- specComplianceSignalCount: quality.specComplianceSignalCount,
249
- specComplianceSignals: Array.isArray(quality.specComplianceSignals) ? quality.specComplianceSignals.slice(0, 20) : [],
250
- rewardHackRisk: quality.rewardHackRisk,
251
- rewardHackSignalCount: quality.rewardHackSignalCount,
252
- rewardHackSignals: Array.isArray(quality.rewardHackSignals) ? quality.rewardHackSignals.slice(0, 20) : [],
253
- longHorizonRisk: quality.longHorizonRisk,
254
- longHorizonSignalCount: quality.longHorizonSignalCount,
255
- longHorizonSignals: Array.isArray(quality.longHorizonSignals) ? quality.longHorizonSignals.slice(0, 20) : [],
256
- proactivityDetected: quality.proactivityDetected,
257
- proactivityRisk: quality.proactivityRisk,
258
- proactivitySignalCount: quality.proactivitySignalCount,
259
- proactivitySignals: Array.isArray(quality.proactivitySignals) ? quality.proactivitySignals.slice(0, 20) : [],
260
- proactivityContextContract: quality.proactivityContextContract,
261
- proactivityHiddenIntentEvidence: quality.proactivityHiddenIntentEvidence,
262
- proactivityClarificationEvidence: quality.proactivityClarificationEvidence,
263
- proactivityPrivacyEvidence: quality.proactivityPrivacyEvidence,
264
- proactivityCompletionEvidence: quality.proactivityCompletionEvidence,
265
- proactivityActionCount: quality.proactivityActionCount,
266
- noEditContractDetected: quality.noEditContractDetected,
267
- editAfterNoEditContract: quality.editAfterNoEditContract,
268
- lastEditSeq: quality.lastEditSeq,
269
- editTargetCount: quality.editTargetCount,
270
- localizedEditTargetCount: quality.localizedEditTargetCount,
271
- unlocalizedEditTargetEvents: Array.isArray(quality.unlocalizedEditTargetEvents) ? quality.unlocalizedEditTargetEvents.slice(0, 20) : [],
272
- contextUtilizationInspectCount: quality.contextUtilizationInspectCount,
273
- contextUtilizationHitCount: quality.contextUtilizationHitCount,
274
- contextUtilizationMissCount: quality.contextUtilizationMissCount,
275
- contextUtilizationPercent: quality.contextUtilizationPercent,
276
- contextUtilizationRisk: quality.contextUtilizationRisk,
277
- contextUtilizationMissEvents: Array.isArray(quality.contextUtilizationMissEvents) ? quality.contextUtilizationMissEvents.slice(0, 20) : [],
278
- preEditContextInspectCount: quality.preEditContextInspectCount,
279
- preEditContextHitCount: quality.preEditContextHitCount,
280
- preEditContextMissCount: quality.preEditContextMissCount,
281
- preEditContextUtilizationPercent: quality.preEditContextUtilizationPercent,
282
- contextBloatRisk: quality.contextBloatRisk,
283
- contextBloatEventCount: quality.contextBloatEventCount,
284
- contextBloatEvents: Array.isArray(quality.contextBloatEvents) ? quality.contextBloatEvents.slice(0, 20) : [],
285
- evidenceGroundingRisk: quality.evidenceGroundingRisk,
286
- evidenceGroundingEventCount: quality.evidenceGroundingEventCount,
287
- evidenceGroundingEvents: Array.isArray(quality.evidenceGroundingEvents) ? quality.evidenceGroundingEvents.slice(0, 20) : [],
288
- broadEditContractDetected: quality.broadEditContractDetected,
289
- largeEditSurfaceTargetCount: quality.largeEditSurfaceTargetCount,
290
- largeEditSurfaceTargets: Array.isArray(quality.largeEditSurfaceTargets) ? quality.largeEditSurfaceTargets.slice(0, 40) : [],
291
- redundantToolCallCount: quality.redundantToolCallCount,
292
- redundantToolCallEvents: Array.isArray(quality.redundantToolCallEvents) ? quality.redundantToolCallEvents.slice(0, 20) : [],
293
- redundantVerifierCount: quality.redundantVerifierCount,
294
- redundantVerifierEvents: Array.isArray(quality.redundantVerifierEvents) ? quality.redundantVerifierEvents.slice(0, 20) : [],
295
- blindRepairCount: quality.blindRepairCount,
296
- blindRepairEvents: Array.isArray(quality.blindRepairEvents) ? quality.blindRepairEvents.slice(0, 20) : [],
297
- failureAlignedRepairCount: quality.failureAlignedRepairCount,
298
- failureUnalignedRepairCount: quality.failureUnalignedRepairCount,
299
- failureUnalignedRepairEvents: Array.isArray(quality.failureUnalignedRepairEvents) ? quality.failureUnalignedRepairEvents.slice(0, 20) : [],
300
- postEditRegressionCycleCount: quality.postEditRegressionCycleCount,
301
- postEditRegressionCycleEvents: Array.isArray(quality.postEditRegressionCycleEvents) ? quality.postEditRegressionCycleEvents.slice(0, 20) : [],
302
- postSuccessMutationCount: quality.postSuccessMutationCount,
303
- postSuccessMutationEvents: Array.isArray(quality.postSuccessMutationEvents) ? quality.postSuccessMutationEvents.slice(0, 20) : [],
304
- predictedEditCount: quality.predictedEditCount,
305
- unpredictedEditCount: quality.unpredictedEditCount,
306
- contradictedEditPredictionCount: quality.contradictedEditPredictionCount,
307
- unverifiedEditPredictionCount: quality.unverifiedEditPredictionCount,
308
- decisionObservabilityRisk: quality.decisionObservabilityRisk,
309
- scratchArtifactPermissionDetected: quality.scratchArtifactPermissionDetected,
310
- scratchArtifactEvents: Array.isArray(quality.scratchArtifactEvents) ? quality.scratchArtifactEvents.slice(0, 20) : [],
311
- postEditDiffReview: quality.postEditDiffReview,
312
- diffReviewAfterLastEdit: quality.diffReviewAfterLastEdit,
313
- firstPostEditDiffReviewSeq: quality.firstPostEditDiffReviewSeq,
314
- firstDiffReviewAfterLastEditSeq: quality.firstDiffReviewAfterLastEditSeq,
315
- broadValidationAfterFirstEdit: quality.broadValidationAfterFirstEdit,
316
- passingBroadValidationAfterFirstEdit: quality.passingBroadValidationAfterFirstEdit,
317
- firstBroadValidationAfterFirstEditSeq: quality.firstBroadValidationAfterFirstEditSeq,
318
- lastPostEditVerificationSeq: quality.lastPostEditVerificationSeq,
319
- lastPostEditVerificationStatus: quality.lastPostEditVerificationStatus,
320
- lastPostEditVerificationConclusiveFailure: quality.lastPostEditVerificationConclusiveFailure,
321
- firstConclusiveFailedVerificationSeq: quality.firstConclusiveFailedVerificationSeq,
322
- testEditPermissionDetected: quality.testEditPermissionDetected,
323
- testHarnessEditEvents: Array.isArray(quality.testHarnessEditEvents) ? quality.testHarnessEditEvents.slice(0, 20) : [],
324
- processScore: quality.processScore,
325
- processDefects: Array.isArray(quality.processDefects) ? quality.processDefects.slice(0, 20) : [],
326
- warnings: Array.isArray(quality.warnings) ? quality.warnings.slice(0, 20) : [],
327
- },
328
- };
329
- }
330
-
331
- function compactAgentContextCompilation(compilation) {
332
- if (!compilation || typeof compilation !== 'object' || Array.isArray(compilation)) return undefined;
333
- const metadata = compilation.metadata && typeof compilation.metadata === 'object' && !Array.isArray(compilation.metadata)
334
- ? compilation.metadata
335
- : {};
336
- return {
337
- version: compilation.version,
338
- format: compilation.format,
339
- task: truncate(compilation.task || '', 2000),
340
- context: truncate(compilation.context || '', 5000),
341
- answer: truncate(compilation.answer || '', 2500),
342
- metadata: {
343
- sessionId: metadata.sessionId,
344
- mode: metadata.mode,
345
- provider: metadata.provider,
346
- model: metadata.model,
347
- eventCount: metadata.eventCount,
348
- contextEventCount: metadata.contextEventCount,
349
- verificationStatus: metadata.verificationStatus,
350
- successfulVerificationCount: metadata.successfulVerificationCount,
351
- processScore: metadata.processScore,
352
- usageTotalTokens: metadata.usageTotalTokens,
353
- estimatedCostUsd: metadata.estimatedCostUsd,
354
- changedFiles: Array.isArray(metadata.changedFiles) ? metadata.changedFiles.slice(0, 100) : [],
355
- verificationCommands: Array.isArray(metadata.verificationCommands) ? metadata.verificationCommands.slice(0, 20) : [],
356
- sourceResearchCoverage: metadata.sourceResearchCoverage,
357
- warnings: Array.isArray(metadata.warnings) ? metadata.warnings.slice(0, 20) : [],
358
- },
359
- };
360
- }
361
-
362
- function compactChangeEvaluation(changeEvaluation) {
363
- if (!changeEvaluation || typeof changeEvaluation !== 'object' || Array.isArray(changeEvaluation)) return undefined;
364
- return {
365
- version: changeEvaluation.version,
366
- format: changeEvaluation.format,
367
- status: changeEvaluation.status,
368
- accepted: changeEvaluation.accepted,
369
- reason: truncate(changeEvaluation.reason || '', 500),
370
- editCount: changeEvaluation.editCount,
371
- predictedEditCount: changeEvaluation.predictedEditCount,
372
- unpredictedEditCount: changeEvaluation.unpredictedEditCount,
373
- confirmedPredictionCount: changeEvaluation.confirmedPredictionCount,
374
- contradictedPredictionCount: changeEvaluation.contradictedPredictionCount,
375
- unverifiedPredictionCount: changeEvaluation.unverifiedPredictionCount,
376
- decisionCoveragePercent: changeEvaluation.decisionCoveragePercent,
377
- regressionCycleCount: changeEvaluation.regressionCycleCount,
378
- broadRegressionFailureCount: changeEvaluation.broadRegressionFailureCount,
379
- predictions: Array.isArray(changeEvaluation.predictions) ? changeEvaluation.predictions.slice(0, 20) : [],
380
- unpredictedEdits: Array.isArray(changeEvaluation.unpredictedEdits) ? changeEvaluation.unpredictedEdits.slice(0, 20) : [],
381
- regressionCycles: Array.isArray(changeEvaluation.regressionCycles) ? changeEvaluation.regressionCycles.slice(0, 20) : [],
382
- recommendedAction: truncate(changeEvaluation.recommendedAction || '', 500),
383
- };
384
- }
385
-
386
- function compactSubmissionBundleManifest(manifest) {
387
- if (!manifest || typeof manifest !== 'object' || Array.isArray(manifest)) return undefined;
388
- return {
389
- version: manifest.version,
390
- format: manifest.format,
391
- submissionReady: manifest.submissionReady,
392
- reason: truncate(manifest.reason || '', 500),
393
- officialResultRequired: manifest.officialResultRequired,
394
- missingOfficialFields: Array.isArray(manifest.missingOfficialFields) ? manifest.missingOfficialFields.slice(0, 20) : [],
395
- benchmark: manifest.benchmark,
396
- benchmarkName: manifest.benchmarkName,
397
- sessionId: manifest.sessionId,
398
- provider: manifest.provider,
399
- model: manifest.model,
400
- summaryContainer: manifest.summaryContainer,
401
- artifacts: Array.isArray(manifest.artifacts) ? manifest.artifacts.slice(0, 50) : [],
402
- verification: manifest.verification,
403
- usage: manifest.usage,
404
- process: manifest.process,
405
- leaderboardDraft: manifest.leaderboardDraft,
406
- };
407
- }
408
-
409
- function compactExperienceCard(card) {
410
- if (!card || typeof card !== 'object' || Array.isArray(card)) return undefined;
411
- return {
412
- version: card.version,
413
- replayCheckpoints: Array.isArray(card.replayCheckpoints) ? card.replayCheckpoints.slice(0, 20) : [],
414
- failureSignatures: Array.isArray(card.failureSignatures) ? card.failureSignatures.slice(0, 10) : [],
415
- sourceResearchCoverage: card.sourceResearchCoverage,
416
- taskContract: card.taskContract,
417
- taskAlignment: compactRiskSignalBlock(card.taskAlignment),
418
- specCompliance: compactRiskSignalBlock(card.specCompliance),
419
- rewardHack: compactRiskSignalBlock(card.rewardHack),
420
- longHorizon: compactRiskSignalBlock(card.longHorizon),
421
- proactivity: compactProactivity(card.proactivity),
422
- environmentReconstruction: compactEnvironmentReconstruction(card.environmentReconstruction),
423
- dependencyUpgrade: compactDependencyUpgrade(card.dependencyUpgrade),
424
- decisionObservability: compactDecisionObservability(card.decisionObservability),
425
- validationReliability: compactValidationReliability(card.validationReliability),
426
- contextUtilization: compactContextUtilization(card.contextUtilization),
427
- runEfficiency: compactRunEfficiency(card.runEfficiency),
428
- verificationCommands: Array.isArray(card.verificationCommands) ? card.verificationCommands.slice(0, 20) : [],
429
- changedFiles: Array.isArray(card.changedFiles) ? card.changedFiles.slice(0, 100) : [],
430
- warnings: Array.isArray(card.warnings) ? card.warnings.slice(0, 20) : [],
431
- };
432
- }
433
-
434
- function compactRiskSignalBlock(block) {
435
- if (!block || typeof block !== 'object' || Array.isArray(block)) return undefined;
436
- return {
437
- risk: block.risk,
438
- signalCount: block.signalCount,
439
- signals: Array.isArray(block.signals) ? block.signals.slice(0, 20) : [],
440
- };
441
- }
442
-
443
- function compactProactivity(proactivity) {
444
- if (!proactivity || typeof proactivity !== 'object' || Array.isArray(proactivity)) return undefined;
445
- return {
446
- detected: proactivity.detected,
447
- risk: proactivity.risk,
448
- signalCount: proactivity.signalCount,
449
- signals: Array.isArray(proactivity.signals) ? proactivity.signals.slice(0, 20) : [],
450
- contextContract: proactivity.contextContract,
451
- hiddenIntentEvidence: proactivity.hiddenIntentEvidence,
452
- clarificationEvidence: proactivity.clarificationEvidence,
453
- privacyEvidence: proactivity.privacyEvidence,
454
- completionEvidence: proactivity.completionEvidence,
455
- actionCount: proactivity.actionCount,
456
- };
457
- }
458
-
459
/**
 * Reduce a run-efficiency record to its scalar metrics plus a bounded list of
 * slow tool events.
 *
 * @param {unknown} runEfficiency - Candidate record; anything that is not a
 *   non-null, non-array object is rejected.
 * @returns {object|undefined} The known metric fields copied as-is and
 *   `slowToolEvents` limited to its first 12 entries (empty array when it is
 *   not an array), or `undefined` when the input is rejected.
 */
function compactRunEfficiency(runEfficiency) {
  if (runEfficiency === null || typeof runEfficiency !== 'object' || Array.isArray(runEfficiency)) {
    return undefined;
  }

  // Copied verbatim, in output order.
  const metricKeys = [
    'toolCallCount',
    'totalToolElapsedMs',
    'maxToolElapsedMs',
    'slowToolCallCount',
    'usageCallCount',
    'totalTokens',
    'estimatedCostUsd',
    'successfulVerificationCount',
    'processScore',
    'processDefectCount',
    'warningCount',
    'invalidToolActionCount',
    'invalidToolActionPercent',
    'costEfficiencyRisk',
    'timeEfficiencyRisk',
  ];

  const compacted = {};
  for (const key of metricKeys) {
    compacted[key] = runEfficiency[key];
  }

  const { slowToolEvents } = runEfficiency;
  compacted.slowToolEvents = Array.isArray(slowToolEvents) ? slowToolEvents.slice(0, 12) : [];
  return compacted;
}
480
-
481
/**
 * Reduce a context-utilization record to its counters and a bounded list of
 * miss events.
 *
 * @param {unknown} contextUtilization - Candidate record; anything that is not
 *   a non-null, non-array object is rejected.
 * @returns {object|undefined} The counter/risk fields copied as-is and
 *   `missEvents` limited to its first 12 entries (empty array when it is not an
 *   array), or `undefined` when the input is rejected.
 */
function compactContextUtilization(contextUtilization) {
  const accepted = contextUtilization !== null
    && typeof contextUtilization === 'object'
    && !Array.isArray(contextUtilization);
  if (!accepted) return undefined;

  const { inspectCount, hitCount, missCount, utilizationPercent, risk, missEvents } = contextUtilization;
  const cappedMisses = Array.isArray(missEvents) ? missEvents.slice(0, 12) : [];
  return { inspectCount, hitCount, missCount, utilizationPercent, risk, missEvents: cappedMisses };
}
492
-
493
/**
 * Reduce a validation-reliability record to its scalar fields plus a bounded
 * list of final verifier commands.
 *
 * @param {unknown} validationReliability - Candidate record; anything that is
 *   not a non-null, non-array object is rejected.
 * @returns {object|undefined} The known fields copied as-is and
 *   `finalVerifierCommands` limited to its first 12 entries (empty array when
 *   it is not an array), or `undefined` when the input is rejected.
 */
function compactValidationReliability(validationReliability) {
  if (
    validationReliability === null
    || typeof validationReliability !== 'object'
    || Array.isArray(validationReliability)
  ) {
    return undefined;
  }

  // Copied verbatim, in output order.
  const scalarKeys = [
    'lastEditSeq',
    'finalEditVerificationCount',
    'finalEditPassingVerificationCount',
    'stableValidationAfterLastEdit',
    'broadValidationAfterLastEdit',
    'passingBroadValidationAfterLastEdit',
    'ciValidationAfterLastEdit',
    'passingCiValidationAfterLastEdit',
    'postEditRegressionCycleCount',
    'lastPostEditVerificationSeq',
    'lastPostEditVerificationStatus',
  ];

  const compacted = {};
  for (const key of scalarKeys) {
    compacted[key] = validationReliability[key];
  }

  const commands = validationReliability.finalVerifierCommands;
  compacted.finalVerifierCommands = Array.isArray(commands) ? commands.slice(0, 12) : [];
  return compacted;
}
510
-
511
/**
 * Reduce a decision-observability record to its counters and a bounded list of
 * edit predictions.
 *
 * @param {unknown} decisionObservability - Candidate record; anything that is
 *   not a non-null, non-array object is rejected.
 * @returns {object|undefined} The three counters copied as-is and
 *   `editPredictions` limited to its first 12 entries (empty array when it is
 *   not an array), or `undefined` when the input is rejected.
 */
function compactDecisionObservability(decisionObservability) {
  const accepted = decisionObservability !== null
    && typeof decisionObservability === 'object'
    && !Array.isArray(decisionObservability);
  if (!accepted) return undefined;

  const { editCount, predictedEditCount, verifiedPredictionCount, editPredictions } = decisionObservability;
  return {
    editCount,
    predictedEditCount,
    verifiedPredictionCount,
    editPredictions: Array.isArray(editPredictions) ? editPredictions.slice(0, 12) : [],
  };
}
520
-
521
/**
 * Reduce an environment-reconstruction record to its setup counters and
 * bounded event lists.
 *
 * @param {unknown} environmentReconstruction - Candidate record; anything that
 *   is not a non-null, non-array object is rejected.
 * @returns {object|undefined} The four counters copied as-is and each of
 *   `setupEvents`, `setupFailures` and `unresolvedSetupFailures` limited to its
 *   first 12 entries (empty array when not an array), or `undefined` when the
 *   input is rejected.
 */
function compactEnvironmentReconstruction(environmentReconstruction) {
  if (
    environmentReconstruction === null
    || typeof environmentReconstruction !== 'object'
    || Array.isArray(environmentReconstruction)
  ) {
    return undefined;
  }

  const firstTwelve = (value) => (Array.isArray(value) ? value.slice(0, 12) : []);
  const {
    setupFailureCount,
    unresolvedSetupFailureCount,
    setupCount,
    successfulSetupCount,
    setupEvents,
    setupFailures,
    unresolvedSetupFailures,
  } = environmentReconstruction;

  return {
    setupFailureCount,
    unresolvedSetupFailureCount,
    setupCount,
    successfulSetupCount,
    setupEvents: firstTwelve(setupEvents),
    setupFailures: firstTwelve(setupFailures),
    unresolvedSetupFailures: firstTwelve(unresolvedSetupFailures),
  };
}
533
-
534
/**
 * Reduce a dependency-upgrade record to its counters, flags and bounded edit
 * lists.
 *
 * @param {unknown} dependencyUpgrade - Candidate record; anything that is not a
 *   non-null, non-array object is rejected.
 * @returns {object|undefined} The known fields copied as-is with
 *   `manifestEdits` and `lockfileEdits` limited to their first 12 entries
 *   (empty array when not an array), or `undefined` when the input is rejected.
 */
function compactDependencyUpgrade(dependencyUpgrade) {
  const accepted = dependencyUpgrade !== null
    && typeof dependencyUpgrade === 'object'
    && !Array.isArray(dependencyUpgrade);
  if (!accepted) return undefined;

  const cap = (value) => (Array.isArray(value) ? value.slice(0, 12) : []);
  const source = dependencyUpgrade;

  // Property order below is the order the fields appear in the serialized output.
  return {
    manifestEditCount: source.manifestEditCount,
    lockfileEditCount: source.lockfileEditCount,
    manifestEdits: cap(source.manifestEdits),
    lockfileEdits: cap(source.lockfileEdits),
    setupAfterManifestEdit: source.setupAfterManifestEdit,
    passingSetupAfterManifestEdit: source.passingSetupAfterManifestEdit,
    validationAfterManifestEdit: source.validationAfterManifestEdit,
    passingValidationAfterManifestEdit: source.passingValidationAfterManifestEdit,
    firstSetupAfterManifestEditSeq: source.firstSetupAfterManifestEditSeq,
    firstValidationAfterManifestEditSeq: source.firstValidationAfterManifestEditSeq,
  };
}
549
-
550
/**
 * Write git-derived artifacts for a work tree and return references to them.
 *
 * When `workdir` exists and `git rev-parse --is-inside-work-tree` reports
 * `true`, this writes up to two redacted files into `artifactRoot`:
 * `cawdex.patch` (the output of `buildWorktreePatch`, when non-blank) and
 * `git-status.txt` (the output of `git status --short`, when non-blank).
 *
 * @param {string} workdir - Directory to inspect.
 * @param {string} artifactRoot - Directory the artifact files are written into.
 * @returns {Array<{kind: string, path: string, contentType: string, description: string}>}
 *   One reference per file written; empty when `workdir` is missing or is not
 *   inside a git work tree.
 */
function collectGitArtifactRefs(workdir, artifactRoot) {
  if (!workdir || !existsSync(workdir)) return [];

  const probe = spawnSync('git', ['-C', workdir, 'rev-parse', '--is-inside-work-tree'], {
    encoding: 'utf8',
    timeout: 5000,
    maxBuffer: 64 * 1024,
  });
  const insideWorkTree = probe.status === 0 && String(probe.stdout || '').trim().startsWith('true');
  if (!insideWorkTree) return [];

  const collected = [];
  // Redacts `content`, writes it under artifactRoot and records the reference.
  const saveArtifact = (kind, fileName, contentType, description, content) => {
    const path = join(artifactRoot, fileName);
    writeFileSync(path, redact(content), 'utf8');
    collected.push({ kind, path, contentType, description });
  };

  const patchText = buildWorktreePatch(workdir);
  if (patchText.trim()) {
    saveArtifact('patch', 'cawdex.patch', 'text/x-diff', 'Redacted git diff after Cawdex run.', patchText);
  }

  const statusRun = spawnSync('git', ['-C', workdir, 'status', '--short'], {
    encoding: 'utf8',
    timeout: 5000,
    maxBuffer: 512 * 1024,
  });
  if (statusRun.stdout && statusRun.stdout.trim()) {
    saveArtifact('git-status', 'git-status.txt', 'text/plain', 'Redacted git status after Cawdex run.', statusRun.stdout);
  }

  return collected;
}
593
-
594
/**
 * Run `git -C <workdir> <args...>` synchronously and return its stdout.
 *
 * @param {string} workdir - Directory passed to `git -C`.
 * @param {string[]} args - Git sub-command and its arguments.
 * @param {{timeout?: number, maxBuffer?: number, allowDiffExit?: boolean}} [options]
 *   `timeout` defaults to 5000 ms and `maxBuffer` to 1 MiB; `allowDiffExit`
 *   additionally accepts exit status 1.
 * @returns {string} Captured stdout, or `''` when the spawn reported an error
 *   or git exited with a status that is not accepted.
 */
function runGit(workdir, args, options = {}) {
  const { timeout, maxBuffer, allowDiffExit } = options;
  const { error, status, stdout } = spawnSync('git', ['-C', workdir, ...args], {
    encoding: 'utf8',
    timeout: timeout || 5000,
    maxBuffer: maxBuffer || 1024 * 1024,
  });

  if (error) return '';
  const accepted = status === 0 || (Boolean(allowDiffExit) && status === 1);
  return accepted ? stdout || '' : '';
}
604
-
605
/**
 * Assemble one patch text from the unstaged diff, the staged diff and the
 * per-file diffs returned by `collectUntrackedDiffs`.
 *
 * @param {string} workdir - Work tree to diff.
 * @returns {string} The non-blank sections, each trimmed, joined by a blank
 *   line and terminated by a single newline; `''` when every section is blank.
 */
function buildWorktreePatch(workdir) {
  const bulkDiffOptions = { timeout: 10000, maxBuffer: 20 * 1024 * 1024 };
  const unstaged = runGit(workdir, ['diff', '--binary', '--no-ext-diff'], bulkDiffOptions);
  const staged = runGit(workdir, ['diff', '--cached', '--binary', '--no-ext-diff'], bulkDiffOptions);
  const untracked = collectUntrackedDiffs(workdir);

  const sections = [];
  for (const chunk of [unstaged, staged, ...untracked]) {
    const trimmed = chunk.trim();
    if (trimmed) sections.push(trimmed);
  }

  if (sections.length === 0) return '';
  return `${sections.join('\n\n')}\n`;
}
613
-
614
/**
 * Produce one "new file" diff per untracked, non-ignored file in a work tree.
 *
 * The file list comes from `git ls-files --others --exclude-standard -z`; each
 * file is then diffed against `/dev/null` with `git diff --no-index`, which
 * exits with status 1 when the inputs differ (hence `allowDiffExit`).
 *
 * @param {string} workdir - Work tree to inspect.
 * @returns {string[]} Non-blank diff texts, for at most the first 80 files.
 */
function collectUntrackedDiffs(workdir) {
  const maxUntrackedFiles = 80;
  const listing = runGit(workdir, ['ls-files', '--others', '--exclude-standard', '-z']);

  // `-z` output is NUL-delimited and verbatim, so names must not be trimmed:
  // leading or trailing whitespace is part of the file name. Only the empty
  // entries produced by the trailing NUL are dropped.
  const files = listing
    .split('\0')
    .filter((file) => file.length > 0)
    .slice(0, maxUntrackedFiles);

  const diffs = [];
  for (const file of files) {
    const diff = runGit(workdir, ['diff', '--no-index', '--binary', '--no-ext-diff', '--', '/dev/null', file], {
      timeout: 5000,
      maxBuffer: 5 * 1024 * 1024,
      allowDiffExit: true,
    });
    if (diff.trim()) diffs.push(diff);
  }
  return diffs;
}
625
-
626
// ---------------------------------------------------------------------------
// Adapter entry point: read the KBench payload, run Cawdex once as a child
// process, write the run artifacts, and emit a single JSON result.
// Failures are reported through the emitted JSON; the process exits with 0.
// ---------------------------------------------------------------------------
let payload;
try {
  payload = readInput();
} catch (err) {
  fail('invalid_adapter', `Could not read adapter input: ${err?.message || err}`);
  process.exit(0);
}

// Valid JSON can still be `null`, an array or a primitive. Reject those here so
// the property reads below cannot throw before a structured failure is emitted.
if (!payload || typeof payload !== 'object' || Array.isArray(payload)) {
  fail('invalid_adapter', 'Adapter input must be a JSON object.');
  process.exit(0);
}

if (payload.mode !== 'task') {
  fail('unsupported_capability', 'Cawdex KBench adapter currently supports task mode only.');
  process.exit(0);
}

const task = payload.task || {};
const config = payload.config || {};
const env = payload.env || task.env || {};
const taskEnv = task.env || {};
const benchmark = task.benchmark || 'swe';
const instruction = String(task.instruction || '').trim();
const profile = profileForBenchmark(benchmark);
const prompt = `/benchmark ${profile} ${instruction}`;
const workdir = config.workDir || env.workdir || env.repoPath || taskEnv.workdir || taskEnv.repoPath || process.cwd();

// Artifact directory precedence: explicit config, then environment override,
// then a per-process directory under the OS temp dir. Created once, below.
const artifactRoot = config.storeDir
  || process.env.CAWDEX_KBENCH_ARTIFACT_DIR
  || join(tmpdir(), `cawdex-kbench-${process.pid}-${Date.now()}`);
mkdirSync(artifactRoot, { recursive: true });

const stdoutPath = join(artifactRoot, 'cawdex.stdout.txt');
const stderrPath = join(artifactRoot, 'cawdex.stderr.txt');
const instructionPath = join(artifactRoot, 'instruction.txt');
const traceDir = join(artifactRoot, 'cawdex-trace');
writeFileSync(instructionPath, redact(instruction), 'utf8');

const commandParts = splitCommand(process.env.CAWDEX_KBENCH_COMMAND || 'cawdex');
if (!commandParts.length) {
  fail('invalid_adapter', 'CAWDEX_KBENCH_COMMAND resolved to an empty command.');
  process.exit(0);
}

const [command, ...prefixArgs] = commandParts;
const args = [
  ...prefixArgs,
  '--prompt', prompt,
  '--perm', process.env.CAWDEX_KBENCH_PERMISSION || 'yolo',
  '--output-format', 'text',
  '--benchmark-trace-dir', traceDir,
];
if (config.modelName) args.push('--model', String(config.modelName));
if (config.temperature !== undefined) args.push('--temperature', String(config.temperature));
if (config.baseUrl) args.push('--base-url', String(config.baseUrl));
if (config.apiKeyEnv) args.push('--api-key-env', String(config.apiKeyEnv));
if (process.env.CAWDEX_KBENCH_EXTRA_ARGS) {
  args.push(...splitCommand(process.env.CAWDEX_KBENCH_EXTRA_ARGS));
}

// Child environment: inherit ours, force tracing on, then layer string-valued
// variables from the payload env and the task env (task env wins on conflict).
const childEnv = {
  ...process.env,
  CAWDEX_BENCHMARK_TRACE: '1',
  CAWDEX_BENCHMARK_TRACE_DIR: traceDir,
  CAWDEX_BASH_TIMEOUT_MS: process.env.CAWDEX_BASH_TIMEOUT_MS || '300000',
};
for (const [key, value] of Object.entries(env.envVars || {})) {
  if (typeof value === 'string') childEnv[key] = value;
}
for (const [key, value] of Object.entries(taskEnv.envVars || {})) {
  if (typeof value === 'string') childEnv[key] = value;
}

const result = spawnSync(command, args, {
  cwd: existsSync(workdir) ? workdir : process.cwd(),
  env: childEnv,
  encoding: 'utf8',
  timeout: typeof config.timeoutMs === 'number' && config.timeoutMs > 0 ? config.timeoutMs : undefined,
  maxBuffer: 20 * 1024 * 1024,
});

const stdout = result.stdout || '';
const stderr = result.stderr || '';
writeFileSync(stdoutPath, redact(stdout), 'utf8');
writeFileSync(stderrPath, redact(stderr), 'utf8');
const elapsedMs = Date.now() - startedAt;
const timedOut = result.error?.code === 'ETIMEDOUT' || result.signal === 'SIGTERM';
// No numeric status means the child did not exit normally: report 124 for a
// timeout and 1 for anything else.
const exitCode = typeof result.status === 'number' ? result.status : (timedOut ? 124 : 1);
const ok = exitCode === 0;
const traceRefs = collectTraceRefs(traceDir);
const traceSummary = compactTraceSummary(readLatestTraceSummary(traceDir));
const workdirUsed = existsSync(workdir) ? workdir : process.cwd();
const gitRefs = collectGitArtifactRefs(workdirUsed, artifactRoot);
const stdoutLines = stdout.split(/\r?\n/).map((line) => line.trim()).filter(Boolean);
// The last non-blank stdout line is reported as the final answer text.
const finalText = stdoutLines.at(-1) || stdout.trim() || (ok ? 'Cawdex completed.' : 'Cawdex produced no stdout.');
const artifacts = [
  { kind: 'instruction', path: instructionPath, contentType: 'text/plain', description: 'KBench task instruction passed to Cawdex.' },
  { kind: 'stdout', path: stdoutPath, contentType: 'text/plain', description: 'Cawdex stdout.' },
  { kind: 'stderr', path: stderrPath, contentType: 'text/plain', description: 'Cawdex stderr.' },
  ...gitRefs,
  ...traceRefs,
];

const output = {
  ok,
  status: ok ? 'ok' : (timedOut ? 'timeout' : 'agent_error'),
  failureKind: ok ? undefined : (timedOut ? 'timeout' : `exit_${exitCode}`),
  finalText: truncate(finalText, 4000),
  elapsedMs,
  artifacts,
  trace: traceRefs.length ? { native: traceRefs } : undefined,
  benchmarkResult: {
    mode: 'cawdex-kbench',
    benchmark,
    profile,
    exitCode,
    workdir: workdirUsed,
    traceSummary,
    verificationEvidence: traceSummary?.verificationEvidence,
    experienceCard: traceSummary?.experienceCard,
    usage: traceSummary?.usage,
  },
  error: ok ? undefined : {
    message: truncate(stderr.trim() || stdout.trim() || result.error?.message || `Cawdex exited with code ${exitCode}`, 2000),
  },
};

emit(output);
1
+ #!/usr/bin/env node
2
+ import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync } from 'node:fs';
3
+ import { tmpdir } from 'node:os';
4
+ import { dirname, join } from 'node:path';
5
+ import { spawnSync } from 'node:child_process';
6
+
7
+ const startedAt = Date.now();
8
+
9
/**
 * Write the adapter result to stdout as a single JSON document.
 * No trailing newline is appended.
 *
 * @param {*} output - Value to serialize with `JSON.stringify`.
 * @returns {void}
 */
function emit(output) {
  const serialized = JSON.stringify(output);
  process.stdout.write(serialized);
}
12
+
13
/**
 * Emit a failure result.
 *
 * @param {string} status - Used for both `status` and `failureKind`.
 * @param {string} message - Used for both `finalText` and `error.message`.
 * @param {object} [extra] - Extra properties merged last, so they override the
 *   defaults on key collision.
 * @returns {void}
 */
function fail(status, message, extra = {}) {
  const failure = {
    ok: false,
    status,
    failureKind: status,
    finalText: message,
    elapsedMs: Date.now() - startedAt,
    artifacts: [],
    error: { message },
  };
  emit({ ...failure, ...extra });
}
25
+
26
/**
 * Read and parse the adapter input payload.
 *
 * The payload is read from the file named by the `KBENCH_ADAPTER_INPUT`
 * environment variable when it is set to a non-empty value, and from file
 * descriptor 0 (standard input) otherwise.
 *
 * @returns {*} The parsed JSON value.
 * @throws {Error} When the source cannot be read, or when its contents are not
 *   valid JSON (`SyntaxError`).
 */
function readInput() {
  const { KBENCH_ADAPTER_INPUT: inputPath } = process.env;
  const source = inputPath ? inputPath : 0;
  const text = readFileSync(source, 'utf8');
  return JSON.parse(text);
}
31
+
32
/**
 * Mask credential-looking tokens in a piece of text.
 *
 * @param {*} text - Value to scrub; falsy values become `''`, everything else
 *   is converted with `String()`.
 * @returns {string} The text with every match of the rules below replaced by
 *   its prefix followed by `[REDACTED]`.
 */
function redact(text) {
  // Applied in order: the `sk-or-v1-` rule runs before the generic `sk-` rule.
  const rules = [
    [/sk-or-v1-[A-Za-z0-9_-]+/g, 'sk-or-v1-[REDACTED]'],
    [/sk-[A-Za-z0-9_-]{16,}/g, 'sk-[REDACTED]'],
    [/hf_[A-Za-z0-9]{16,}/g, 'hf_[REDACTED]'],
    [/KGAT_[A-Za-z0-9]{16,}/g, 'KGAT_[REDACTED]'],
    [/npm_[A-Za-z0-9]{16,}/g, 'npm_[REDACTED]'],
  ];

  let scrubbed = String(text || '');
  for (const [pattern, mask] of rules) {
    scrubbed = scrubbed.replace(pattern, mask);
  }
  return scrubbed;
}
40
+
41
/**
 * Redact a piece of text and shorten it when it exceeds `max` characters.
 *
 * Text longer than `max` keeps its first `max - 80` characters (80 characters
 * are reserved for the marker) followed by a `...[truncated N chars]` marker,
 * where N is the number of characters dropped. When `max` is below 80 nothing
 * of the text is kept and only the marker is returned; the retained length is
 * clamped at zero so a negative slice end can never keep the tail-trimmed
 * remainder of the string.
 *
 * @param {*} text - Value to shorten; passed through `redact` first.
 * @param {number} max - Length above which the text is truncated.
 * @returns {string} The redacted text, truncated when necessary.
 */
function truncate(text, max) {
  const safe = redact(text);
  if (safe.length <= max) return safe;

  const keep = Math.max(0, max - 80);
  return `${safe.slice(0, keep)}\n...[truncated ${safe.length - keep} chars]`;
}
46
+
47
/**
 * Split a command line into arguments using a small shell-like grammar.
 *
 * - Unquoted whitespace separates arguments.
 * - Single or double quotes group characters; the quote characters are dropped.
 * - Inside quotes, a backslash makes the next character literal. Outside
 *   quotes a backslash is an ordinary character.
 * - Empty arguments are never produced, so `""` yields nothing.
 *
 * @param {string} command - Command line to split.
 * @returns {string[]} The arguments in order.
 */
function splitCommand(command) {
  const tokens = [];
  let current = '';
  let activeQuote = null;
  let pendingEscape = false;

  const flush = () => {
    if (current) {
      tokens.push(current);
      current = '';
    }
  };

  for (const char of command.trim()) {
    if (pendingEscape) {
      current += char;
      pendingEscape = false;
    } else if (activeQuote !== null) {
      if (char === '\\') pendingEscape = true;
      else if (char === activeQuote) activeQuote = null;
      else current += char;
    } else if (char === '"' || char === "'") {
      activeQuote = char;
    } else if (/\s/.test(char)) {
      flush();
    } else {
      current += char;
    }
  }

  flush();
  return tokens;
}
83
+
84
/**
 * Map a benchmark name to the Cawdex benchmark profile name.
 *
 * The name is lower-cased and stripped of every character outside `a-z0-9`
 * before it is matched, so `SWE-bench`, `swe_bench` and `swebench` are
 * equivalent.
 *
 * @param {*} benchmark - Benchmark identifier; falsy values are treated as `''`.
 * @returns {string} The matching profile, or `'generic'` when nothing matches.
 */
function profileForBenchmark(benchmark) {
  const slug = String(benchmark || '').toLowerCase().replace(/[^a-z0-9]+/g, '');

  // [profile, exact aliases]; checked in order.
  const aliasGroups = [
    ['swe-bench', ['swe', 'swebench']],
    ['terminal-bench', ['tb2', 'terminalbench']],
    ['terminalworld', ['terminalworld', 'terminalworldbench', 'tw', 'tworld']],
    ['swe-chain', ['swechain', 'chain', 'upgrade']],
    ['swe-cycle', ['swecycle', 'swecyclebench', 'fullcycle', 'swejudge']],
    ['swe-ci', ['sweci', 'swecibench']],
    ['swe-prbench', ['swepr', 'sweprbench', 'prbench', 'prreview', 'pullrequestreview', 'codereviewbench']],
    ['tml-bench', ['tml', 'tmlbench', 'tabularml', 'kaggleml', 'kagglebench', 'datascience']],
    ['pi-bench', ['pi', 'pibench', 'proactive', 'proactiveassistant', 'personalassistant', 'hiddenintent']],
    ['ci-repair', ['cirepair', 'cirepairbench', 'ci']],
    ['wildclaw', ['wildclaw', 'wildclawbench', 'wcbench']],
    ['arc-agi', ['arc', 'arcagi', 'arcagi3', 'arcprize']],
    ['specbench', ['spec', 'specbench', 'speccompliance']],
    ['reward-hacking', ['rhb', 'rewardhack', 'rewardhacking', 'rewardhackingagents']],
    ['roadmapbench', ['roadmap', 'roadmapbench', 'longhorizon', 'versionupgrade']],
    ['saasbench', ['saas', 'saasbench', 'enterprise']],
    ['swe-bench-mobile', ['mobile', 'swebenchmobile', 'swemobile', 'ios']],
    ['webdevbench', ['webdev', 'webdevbench', 'swewebdev', 'swewebdevbench', 'vibecoding']],
    ['appworld', ['app', 'appworld', 'appworldbench']],
    ['browsecomp', ['browsecomp', 'browsecompplus', 'deepresearch', 'webresearch']],
    ['tau2', ['tau', 'tau2', 'taubench', 'taubench2']],
  ];

  for (const [profile, aliases] of aliasGroups) {
    if (aliases.includes(slug)) return profile;
  }

  // tau2 also accepts suffixed variants such as a domain-qualified name.
  if (slug.startsWith('tau2') || slug.startsWith('taubench')) return 'tau2';
  return 'generic';
}
109
+
110
/**
 * Walk a trace directory and return a reference for every `summary.json` and
 * `trace.jsonl` file found at any depth.
 *
 * @param {string} traceDir - Root directory to walk.
 * @returns {Array<{kind: string, path: string, contentType: string, description: string}>}
 *   One reference per matching file; empty when `traceDir` is falsy or does
 *   not exist.
 */
function collectTraceRefs(traceDir) {
  const found = [];
  if (!traceDir || !existsSync(traceDir)) return found;

  const knownFiles = new Map([
    ['summary.json', { kind: 'cawdex-summary', contentType: 'application/json' }],
    ['trace.jsonl', { kind: 'cawdex-tool-trace', contentType: 'application/jsonl' }],
  ]);

  // Iterative walk: sub-directories are queued and visited after the current one.
  const pending = [traceDir];
  while (pending.length > 0) {
    const current = pending.pop();
    for (const entry of readdirSync(current, { withFileTypes: true })) {
      const entryPath = join(current, entry.name);
      if (entry.isDirectory()) {
        pending.push(entryPath);
        continue;
      }
      const meta = knownFiles.get(entry.name);
      if (!meta) continue;
      found.push({
        kind: meta.kind,
        path: entryPath,
        contentType: meta.contentType,
        description: `Cawdex ${entry.name}`,
      });
    }
  }
  return found;
}
132
+
133
/**
 * Find the most recent `summary.json` under a trace directory.
 *
 * Each summary is ranked by its parsed `endedAt` timestamp; when that is
 * missing, unparseable or zero, the file's modification time is used instead.
 * Summaries that cannot be read, parsed or stat'ed are skipped.
 *
 * @param {string} traceDir - Root directory to walk, at any depth.
 * @returns {{path: string, summary: *, sortKey: number}|null} The highest
 *   ranked summary (the first one encountered wins a tie), or `null` when
 *   `traceDir` is falsy, does not exist, or holds no usable summary.
 */
function readLatestTraceSummary(traceDir) {
  if (!traceDir || !existsSync(traceDir)) return null;

  let latest = null;
  const pending = [traceDir];
  while (pending.length > 0) {
    const current = pending.pop();
    for (const entry of readdirSync(current, { withFileTypes: true })) {
      const entryPath = join(current, entry.name);
      if (entry.isDirectory()) {
        pending.push(entryPath);
        continue;
      }
      if (entry.name !== 'summary.json') continue;

      let candidate;
      try {
        const summary = JSON.parse(readFileSync(entryPath, 'utf8'));
        const parsedEnd = Date.parse(summary.endedAt);
        const endedAtMs = Number.isFinite(parsedEnd) ? parsedEnd : 0;
        const { mtimeMs } = statSync(entryPath);
        candidate = { path: entryPath, summary, sortKey: endedAtMs || mtimeMs };
      } catch {
        // Ignore malformed or partially-written trace summaries.
        continue;
      }

      // Strictly-greater keeps the earliest-seen candidate on equal keys.
      if (latest === null || candidate.sortKey > latest.sortKey) {
        latest = candidate;
      }
    }
  }
  return latest;
}
158
+
159
/**
 * Reduce a trace summary record to a bounded, serializable digest.
 *
 * Scalar fields are copied as-is; list fields are limited to a fixed number of
 * leading entries and become `[]` when the source value is not an array.
 * Nested sections are delegated to the matching `compact*` helper.
 *
 * @param {{path?: string, summary?: object}|null|undefined} traceSummary - Record
 *   holding the summary file path and its parsed contents.
 * @returns {object|undefined} The digest, or `undefined` when `traceSummary`
 *   is falsy.
 */
function compactTraceSummary(traceSummary) {
  if (!traceSummary) return undefined;

  const summary = traceSummary.summary || {};
  const quality = summary.trajectoryQuality || {};
  const capped = (value, limit) => (Array.isArray(value) ? value.slice(0, limit) : []);

  // Ordered trajectory-quality field spec; the order is the output key order.
  // A string is copied as-is. `[key]` is a list capped at 20 entries and
  // `[key, limit]` a list capped at `limit` entries.
  const qualityFields = [
    'benchmarkContextUsed',
    'usageCallCount',
    'usageTotalTokens',
    'usageEstimatedCostUsd',
    'costEfficiencyRisk',
    'totalToolElapsedMs',
    'maxToolElapsedMs',
    'slowToolCallCount',
    ['slowToolEvents'],
    'timeEfficiencyRisk',
    'invalidToolActionCount',
    'invalidToolActionPercent',
    ['invalidToolActionEvents'],
    'localizationBeforeFirstEdit',
    'failingReproductionBeforeFirstEdit',
    'passingValidationAfterFirstEdit',
    'validationAfterLastEdit',
    'passingValidationAfterLastEdit',
    'finalEditVerificationCount',
    'finalEditPassingVerificationCount',
    'stableValidationAfterLastEdit',
    'broadValidationAfterLastEdit',
    'passingBroadValidationAfterLastEdit',
    'successfulVerificationCount',
    'failedVerificationCount',
    'incompleteVerifierCount',
    ['incompleteVerifierEvents'],
    ['inconclusiveVerifierEvents'],
    'environmentSetupFailureCount',
    ['environmentSetupFailureEvents'],
    'unresolvedEnvironmentSetupFailureCount',
    ['unresolvedEnvironmentSetupFailureEvents'],
    'environmentSetupCount',
    'successfulEnvironmentSetupCount',
    ['environmentSetupEvents'],
    'dependencyManifestEditCount',
    'dependencyLockfileEditCount',
    ['dependencyManifestEditEvents'],
    ['dependencyLockfileEditEvents'],
    'dependencySetupAfterManifestEdit',
    'passingDependencySetupAfterManifestEdit',
    'dependencyValidationAfterManifestEdit',
    'passingDependencyValidationAfterManifestEdit',
    'firstDependencySetupAfterManifestEditSeq',
    'firstDependencyValidationAfterManifestEditSeq',
    'skillViewCount',
    ['skillViewEvents'],
    ['skillNames'],
    'skillLoadedBeforeLocalContext',
    'excessiveSkillViewCount',
    'ciWorkflowCommandCount',
    ['ciVerifierCommands'],
    'ciValidationAfterFirstEdit',
    'passingCiValidationAfterFirstEdit',
    'ciValidationAfterLastEdit',
    'passingCiValidationAfterLastEdit',
    'firstCiValidationAfterFirstEditSeq',
    'sourceResearchCoverage',
    'taskContractSignalCount',
    'taskContractChecklistUsed',
    'taskContractChecklistAfterContext',
    'taskContractChecklistComplete',
    'latestTodoSeq',
    'todoIncompleteCount',
    ['todoIncompleteItems'],
    'taskAlignmentRisk',
    'taskAlignmentSignalCount',
    ['taskAlignmentSignals'],
    'specComplianceRisk',
    'specComplianceSignalCount',
    ['specComplianceSignals'],
    'rewardHackRisk',
    'rewardHackSignalCount',
    ['rewardHackSignals'],
    'longHorizonRisk',
    'longHorizonSignalCount',
    ['longHorizonSignals'],
    'proactivityDetected',
    'proactivityRisk',
    'proactivitySignalCount',
    ['proactivitySignals'],
    'proactivityContextContract',
    'proactivityHiddenIntentEvidence',
    'proactivityClarificationEvidence',
    'proactivityPrivacyEvidence',
    'proactivityCompletionEvidence',
    'proactivityActionCount',
    'noEditContractDetected',
    'editAfterNoEditContract',
    'lastEditSeq',
    'editTargetCount',
    'localizedEditTargetCount',
    ['unlocalizedEditTargetEvents'],
    'contextUtilizationInspectCount',
    'contextUtilizationHitCount',
    'contextUtilizationMissCount',
    'contextUtilizationPercent',
    'contextUtilizationRisk',
    ['contextUtilizationMissEvents'],
    'preEditContextInspectCount',
    'preEditContextHitCount',
    'preEditContextMissCount',
    'preEditContextUtilizationPercent',
    'contextBloatRisk',
    'contextBloatEventCount',
    ['contextBloatEvents'],
    'evidenceGroundingRisk',
    'evidenceGroundingEventCount',
    ['evidenceGroundingEvents'],
    'broadEditContractDetected',
    'largeEditSurfaceTargetCount',
    ['largeEditSurfaceTargets', 40],
    'redundantToolCallCount',
    ['redundantToolCallEvents'],
    'redundantVerifierCount',
    ['redundantVerifierEvents'],
    'blindRepairCount',
    ['blindRepairEvents'],
    'failureAlignedRepairCount',
    'failureUnalignedRepairCount',
    ['failureUnalignedRepairEvents'],
    'postEditRegressionCycleCount',
    ['postEditRegressionCycleEvents'],
    'postSuccessMutationCount',
    ['postSuccessMutationEvents'],
    'predictedEditCount',
    'unpredictedEditCount',
    'contradictedEditPredictionCount',
    'unverifiedEditPredictionCount',
    'decisionObservabilityRisk',
    'scratchArtifactPermissionDetected',
    ['scratchArtifactEvents'],
    'postEditDiffReview',
    'diffReviewAfterLastEdit',
    'firstPostEditDiffReviewSeq',
    'firstDiffReviewAfterLastEditSeq',
    'broadValidationAfterFirstEdit',
    'passingBroadValidationAfterFirstEdit',
    'firstBroadValidationAfterFirstEditSeq',
    'lastPostEditVerificationSeq',
    'lastPostEditVerificationStatus',
    'lastPostEditVerificationConclusiveFailure',
    'firstConclusiveFailedVerificationSeq',
    'testEditPermissionDetected',
    ['testHarnessEditEvents'],
    'processScore',
    ['processDefects'],
    ['warnings'],
  ];

  const compactQuality = () => {
    const picked = {};
    for (const field of qualityFields) {
      if (typeof field === 'string') {
        picked[field] = quality[field];
      } else {
        const [key, limit = 20] = field;
        picked[key] = capped(quality[key], limit);
      }
    }
    return picked;
  };

  const experienceCard = compactExperienceCard(summary.experienceCard);
  return {
    path: traceSummary.path,
    verificationCount: summary.verificationCount,
    verificationCommands: capped(summary.verificationCommands, 20),
    verificationEvidence: summary.verificationEvidence,
    finalAnswerEvidence: summary.finalAnswerEvidence,
    usage: summary.usage,
    experienceCard,
    agentContextCompilation: compactAgentContextCompilation(summary.agentContextCompilation),
    changeEvaluation: compactChangeEvaluation(summary.changeEvaluation),
    submissionBundleManifest: compactSubmissionBundleManifest(summary.submissionBundleManifest),
    changedFiles: capped(summary.changedFiles, 100),
    worktreeChangedFiles: capped(summary.worktreeChangedFiles, 100),
    artifacts: capped(summary.artifacts, 20),
    trajectoryQuality: compactQuality(),
  };
}
330
+
331
/**
 * Build a size-bounded copy of an agent context compilation record.
 *
 * The `task`, `context` and `answer` texts are passed through `truncate`
 * with limits of 2000, 5000 and 2500 respectively. Metadata is copied field
 * by field; its list fields are capped (100 changed files, 20 verification
 * commands, 20 warnings) and default to `[]` when they are not arrays.
 *
 * @param {object} compilation - Raw compilation record.
 * @returns {object|undefined} The compact record, or `undefined` when the
 *   input is not a non-array object.
 */
function compactAgentContextCompilation(compilation) {
  const isRecord = (value) => value !== null && typeof value === 'object' && !Array.isArray(value);
  if (!isRecord(compilation)) return undefined;

  // Anything other than a plain record is treated as "no metadata".
  const meta = isRecord(compilation.metadata) ? compilation.metadata : {};
  const capList = (value, limit) => (Array.isArray(value) ? value.slice(0, limit) : []);
  const { version, format } = compilation;

  return {
    version,
    format,
    task: truncate(compilation.task || '', 2000),
    context: truncate(compilation.context || '', 5000),
    answer: truncate(compilation.answer || '', 2500),
    metadata: {
      sessionId: meta.sessionId,
      mode: meta.mode,
      provider: meta.provider,
      model: meta.model,
      eventCount: meta.eventCount,
      contextEventCount: meta.contextEventCount,
      verificationStatus: meta.verificationStatus,
      successfulVerificationCount: meta.successfulVerificationCount,
      processScore: meta.processScore,
      usageTotalTokens: meta.usageTotalTokens,
      estimatedCostUsd: meta.estimatedCostUsd,
      changedFiles: capList(meta.changedFiles, 100),
      verificationCommands: capList(meta.verificationCommands, 20),
      sourceResearchCoverage: meta.sourceResearchCoverage,
      warnings: capList(meta.warnings, 20),
    },
  };
}
361
+
362
/**
 * Build a size-bounded copy of a change-evaluation record.
 *
 * Scalar fields are copied as-is, `reason` and `recommendedAction` are passed
 * through `truncate` with a limit of 500, and the `predictions`,
 * `unpredictedEdits` and `regressionCycles` lists are capped at 20 entries
 * (or replaced by `[]` when they are not arrays).
 *
 * @param {object} changeEvaluation - Raw change-evaluation record.
 * @returns {object|undefined} The compact record, or `undefined` when the
 *   input is not a non-array object.
 */
function compactChangeEvaluation(changeEvaluation) {
  if (changeEvaluation === null || typeof changeEvaluation !== 'object' || Array.isArray(changeEvaluation)) {
    return undefined;
  }

  const firstTwenty = (value) => (Array.isArray(value) ? value.slice(0, 20) : []);
  const {
    version,
    format,
    status,
    accepted,
    editCount,
    predictedEditCount,
    unpredictedEditCount,
    confirmedPredictionCount,
    contradictedPredictionCount,
    unverifiedPredictionCount,
    decisionCoveragePercent,
    regressionCycleCount,
    broadRegressionFailureCount,
  } = changeEvaluation;

  return {
    version,
    format,
    status,
    accepted,
    reason: truncate(changeEvaluation.reason || '', 500),
    editCount,
    predictedEditCount,
    unpredictedEditCount,
    confirmedPredictionCount,
    contradictedPredictionCount,
    unverifiedPredictionCount,
    decisionCoveragePercent,
    regressionCycleCount,
    broadRegressionFailureCount,
    predictions: firstTwenty(changeEvaluation.predictions),
    unpredictedEdits: firstTwenty(changeEvaluation.unpredictedEdits),
    regressionCycles: firstTwenty(changeEvaluation.regressionCycles),
    recommendedAction: truncate(changeEvaluation.recommendedAction || '', 500),
  };
}
385
+
386
/**
 * Build a size-bounded copy of a submission bundle manifest.
 *
 * `reason` is passed through `truncate` with a limit of 500,
 * `missingOfficialFields` is capped at 20 entries and `artifacts` at 50
 * (both default to `[]` when not arrays). All other listed fields are copied
 * by reference without further trimming.
 *
 * @param {object} manifest - Raw manifest record.
 * @returns {object|undefined} The compact manifest, or `undefined` when the
 *   input is not a non-array object.
 */
function compactSubmissionBundleManifest(manifest) {
  const unusable = manifest === null || typeof manifest !== 'object' || Array.isArray(manifest);
  if (unusable) return undefined;

  const capList = (value, limit) => (Array.isArray(value) ? value.slice(0, limit) : []);
  const compact = {};

  compact.version = manifest.version;
  compact.format = manifest.format;
  compact.submissionReady = manifest.submissionReady;
  compact.reason = truncate(manifest.reason || '', 500);
  compact.officialResultRequired = manifest.officialResultRequired;
  compact.missingOfficialFields = capList(manifest.missingOfficialFields, 20);
  for (const key of ['benchmark', 'benchmarkName', 'sessionId', 'provider', 'model', 'summaryContainer']) {
    compact[key] = manifest[key];
  }
  compact.artifacts = capList(manifest.artifacts, 50);
  for (const key of ['verification', 'usage', 'process', 'leaderboardDraft']) {
    compact[key] = manifest[key];
  }

  return compact;
}
408
+
409
/**
 * Build a size-bounded copy of an experience card.
 *
 * Top-level lists are capped (20 replay checkpoints, 10 failure signatures,
 * 20 verification commands, 100 changed files, 20 warnings) and default to
 * `[]` when not arrays. Each nested section is delegated to its dedicated
 * compactor; `sourceResearchCoverage` and `taskContract` are copied as-is.
 *
 * @param {object} card - Raw experience card.
 * @returns {object|undefined} The compact card, or `undefined` when the input
 *   is not a non-array object.
 */
function compactExperienceCard(card) {
  if (card === null || typeof card !== 'object' || Array.isArray(card)) return undefined;

  const capList = (value, limit) => (Array.isArray(value) ? value.slice(0, limit) : []);

  const compact = {
    version: card.version,
    replayCheckpoints: capList(card.replayCheckpoints, 20),
    failureSignatures: capList(card.failureSignatures, 10),
    sourceResearchCoverage: card.sourceResearchCoverage,
    taskContract: card.taskContract,
  };

  // The four risk sections share one shape and therefore one compactor.
  for (const key of ['taskAlignment', 'specCompliance', 'rewardHack', 'longHorizon']) {
    compact[key] = compactRiskSignalBlock(card[key]);
  }

  compact.proactivity = compactProactivity(card.proactivity);
  compact.environmentReconstruction = compactEnvironmentReconstruction(card.environmentReconstruction);
  compact.dependencyUpgrade = compactDependencyUpgrade(card.dependencyUpgrade);
  compact.decisionObservability = compactDecisionObservability(card.decisionObservability);
  compact.validationReliability = compactValidationReliability(card.validationReliability);
  compact.contextUtilization = compactContextUtilization(card.contextUtilization);
  compact.runEfficiency = compactRunEfficiency(card.runEfficiency);

  compact.verificationCommands = capList(card.verificationCommands, 20);
  compact.changedFiles = capList(card.changedFiles, 100);
  compact.warnings = capList(card.warnings, 20);

  return compact;
}
433
+
434
/**
 * Reduce a risk/signal section to its `risk`, `signalCount` and at most the
 * first 20 `signals` (`[]` when `signals` is not an array).
 *
 * @param {object} block - Raw risk section.
 * @returns {object|undefined} The compact section, or `undefined` when the
 *   input is not a non-array object.
 */
function compactRiskSignalBlock(block) {
  if (block === null || typeof block !== 'object' || Array.isArray(block)) return undefined;

  const { risk, signalCount, signals } = block;
  const cappedSignals = Array.isArray(signals) ? signals.slice(0, 20) : [];
  return { risk, signalCount, signals: cappedSignals };
}
442
+
443
/**
 * Reduce a proactivity section to its known fields, keeping at most the
 * first 20 `signals` (`[]` when `signals` is not an array).
 *
 * @param {object} proactivity - Raw proactivity section.
 * @returns {object|undefined} The compact section, or `undefined` when the
 *   input is not a non-array object.
 */
function compactProactivity(proactivity) {
  const unusable = proactivity === null || typeof proactivity !== 'object' || Array.isArray(proactivity);
  if (unusable) return undefined;

  const {
    detected,
    risk,
    signalCount,
    signals,
    contextContract,
    hiddenIntentEvidence,
    clarificationEvidence,
    privacyEvidence,
    completionEvidence,
    actionCount,
  } = proactivity;

  return {
    detected,
    risk,
    signalCount,
    signals: Array.isArray(signals) ? signals.slice(0, 20) : [],
    contextContract,
    hiddenIntentEvidence,
    clarificationEvidence,
    privacyEvidence,
    completionEvidence,
    actionCount,
  };
}
458
+
459
/**
 * Reduce a run-efficiency section to its known scalar fields plus at most the
 * first 12 `slowToolEvents` (`[]` when that field is not an array).
 *
 * @param {object} runEfficiency - Raw run-efficiency section.
 * @returns {object|undefined} The compact section, or `undefined` when the
 *   input is not a non-array object.
 */
function compactRunEfficiency(runEfficiency) {
  if (runEfficiency === null || typeof runEfficiency !== 'object' || Array.isArray(runEfficiency)) {
    return undefined;
  }

  // Fields copied verbatim, in output order.
  const scalarKeys = [
    'toolCallCount',
    'totalToolElapsedMs',
    'maxToolElapsedMs',
    'slowToolCallCount',
    'usageCallCount',
    'totalTokens',
    'estimatedCostUsd',
    'successfulVerificationCount',
    'processScore',
    'processDefectCount',
    'warningCount',
    'invalidToolActionCount',
    'invalidToolActionPercent',
    'costEfficiencyRisk',
    'timeEfficiencyRisk',
  ];

  const compact = {};
  for (const key of scalarKeys) {
    compact[key] = runEfficiency[key];
  }

  const { slowToolEvents } = runEfficiency;
  compact.slowToolEvents = Array.isArray(slowToolEvents) ? slowToolEvents.slice(0, 12) : [];
  return compact;
}
480
+
481
/**
 * Reduce a context-utilization section to its counters, percentage and risk,
 * plus at most the first 12 `missEvents` (`[]` when not an array).
 *
 * @param {object} contextUtilization - Raw context-utilization section.
 * @returns {object|undefined} The compact section, or `undefined` when the
 *   input is not a non-array object.
 */
function compactContextUtilization(contextUtilization) {
  const unusable = contextUtilization === null
    || typeof contextUtilization !== 'object'
    || Array.isArray(contextUtilization);
  if (unusable) return undefined;

  const { inspectCount, hitCount, missCount, utilizationPercent, risk, missEvents } = contextUtilization;
  let cappedMisses = [];
  if (Array.isArray(missEvents)) {
    cappedMisses = missEvents.slice(0, 12);
  }

  return { inspectCount, hitCount, missCount, utilizationPercent, risk, missEvents: cappedMisses };
}
492
+
493
/**
 * Reduce a validation-reliability section to its known fields, keeping at
 * most the first 12 `finalVerifierCommands` (`[]` when not an array).
 *
 * @param {object} validationReliability - Raw validation-reliability section.
 * @returns {object|undefined} The compact section, or `undefined` when the
 *   input is not a non-array object.
 */
function compactValidationReliability(validationReliability) {
  const source = validationReliability;
  if (source === null || typeof source !== 'object' || Array.isArray(source)) return undefined;

  // Fields copied verbatim, in output order.
  const passthroughKeys = [
    'lastEditSeq',
    'finalEditVerificationCount',
    'finalEditPassingVerificationCount',
    'stableValidationAfterLastEdit',
    'broadValidationAfterLastEdit',
    'passingBroadValidationAfterLastEdit',
    'ciValidationAfterLastEdit',
    'passingCiValidationAfterLastEdit',
    'postEditRegressionCycleCount',
    'lastPostEditVerificationSeq',
    'lastPostEditVerificationStatus',
  ];

  const compact = Object.fromEntries(passthroughKeys.map((key) => [key, source[key]]));
  const commands = source.finalVerifierCommands;
  compact.finalVerifierCommands = Array.isArray(commands) ? commands.slice(0, 12) : [];
  return compact;
}
510
+
511
/**
 * Reduce a decision-observability section to its three counters plus at most
 * the first 12 `editPredictions` (`[]` when not an array).
 *
 * @param {object} decisionObservability - Raw decision-observability section.
 * @returns {object|undefined} The compact section, or `undefined` when the
 *   input is not a non-array object.
 */
function compactDecisionObservability(decisionObservability) {
  const unusable = decisionObservability === null
    || typeof decisionObservability !== 'object'
    || Array.isArray(decisionObservability);
  if (unusable) return undefined;

  const { editCount, predictedEditCount, verifiedPredictionCount, editPredictions } = decisionObservability;
  return {
    editCount,
    predictedEditCount,
    verifiedPredictionCount,
    editPredictions: Array.isArray(editPredictions) ? editPredictions.slice(0, 12) : [],
  };
}
520
+
521
/**
 * Reduce an environment-reconstruction section to its four counters and its
 * three event lists, each list capped at 12 entries (`[]` when not an array).
 *
 * @param {object} environmentReconstruction - Raw section.
 * @returns {object|undefined} The compact section, or `undefined` when the
 *   input is not a non-array object.
 */
function compactEnvironmentReconstruction(environmentReconstruction) {
  const source = environmentReconstruction;
  if (source === null || typeof source !== 'object' || Array.isArray(source)) return undefined;

  const counterKeys = ['setupFailureCount', 'unresolvedSetupFailureCount', 'setupCount', 'successfulSetupCount'];
  const listKeys = ['setupEvents', 'setupFailures', 'unresolvedSetupFailures'];

  return Object.fromEntries([
    ...counterKeys.map((key) => [key, source[key]]),
    ...listKeys.map((key) => [key, Array.isArray(source[key]) ? source[key].slice(0, 12) : []]),
  ]);
}
533
+
534
/**
 * Reduce a dependency-upgrade section to its known fields, keeping at most
 * the first 12 `manifestEdits` and `lockfileEdits` (`[]` when not arrays).
 *
 * @param {object} dependencyUpgrade - Raw dependency-upgrade section.
 * @returns {object|undefined} The compact section, or `undefined` when the
 *   input is not a non-array object.
 */
function compactDependencyUpgrade(dependencyUpgrade) {
  if (dependencyUpgrade === null || typeof dependencyUpgrade !== 'object' || Array.isArray(dependencyUpgrade)) {
    return undefined;
  }

  const firstTwelve = (value) => (Array.isArray(value) ? value.slice(0, 12) : []);
  const {
    manifestEditCount,
    lockfileEditCount,
    manifestEdits,
    lockfileEdits,
    setupAfterManifestEdit,
    passingSetupAfterManifestEdit,
    validationAfterManifestEdit,
    passingValidationAfterManifestEdit,
    firstSetupAfterManifestEditSeq,
    firstValidationAfterManifestEditSeq,
  } = dependencyUpgrade;

  return {
    manifestEditCount,
    lockfileEditCount,
    manifestEdits: firstTwelve(manifestEdits),
    lockfileEdits: firstTwelve(lockfileEdits),
    setupAfterManifestEdit,
    passingSetupAfterManifestEdit,
    validationAfterManifestEdit,
    passingValidationAfterManifestEdit,
    firstSetupAfterManifestEditSeq,
    firstValidationAfterManifestEditSeq,
  };
}
549
+
550
/**
 * Capture the git state of a working directory as artifact files.
 *
 * When `workdir` exists and `git rev-parse --is-inside-work-tree` reports
 * `true`, this writes up to two redacted files into `artifactRoot`:
 * `cawdex.patch` (output of `buildWorktreePatch`) and `git-status.txt`
 * (output of `git status --short`). A file is only written when its content
 * is non-blank.
 *
 * @param {string} workdir - Directory to inspect.
 * @param {string} artifactRoot - Directory the artifact files are written to.
 * @returns {Array<{kind: string, path: string, contentType: string, description: string}>}
 *   One reference per file written; empty when `workdir` is missing or is not
 *   inside a git work tree.
 */
function collectGitArtifactRefs(workdir, artifactRoot) {
  if (!workdir || !existsSync(workdir)) return [];

  const probe = spawnSync('git', ['-C', workdir, 'rev-parse', '--is-inside-work-tree'], {
    encoding: 'utf8',
    timeout: 5000,
    maxBuffer: 64 * 1024,
  });
  const insideWorkTree = probe.status === 0 && String(probe.stdout || '').trim().startsWith('true');
  if (!insideWorkTree) return [];

  const collected = [];
  // Writes the redacted text under artifactRoot and records its reference.
  const saveArtifact = (kind, fileName, text, contentType, description) => {
    const path = join(artifactRoot, fileName);
    writeFileSync(path, redact(text), 'utf8');
    collected.push({ kind, path, contentType, description });
  };

  const patchText = buildWorktreePatch(workdir);
  if (patchText.trim()) {
    saveArtifact('patch', 'cawdex.patch', patchText, 'text/x-diff', 'Redacted git diff after Cawdex run.');
  }

  const statusRun = spawnSync('git', ['-C', workdir, 'status', '--short'], {
    encoding: 'utf8',
    timeout: 5000,
    maxBuffer: 512 * 1024,
  });
  const statusText = statusRun.stdout;
  if (statusText && statusText.trim()) {
    saveArtifact('git-status', 'git-status.txt', statusText, 'text/plain', 'Redacted git status after Cawdex run.');
  }

  return collected;
}
593
+
594
/**
 * Run `git -C <workdir> <args...>` synchronously and return its stdout.
 *
 * @param {string} workdir - Directory passed to `git -C`.
 * @param {string[]} args - Git sub-command and its arguments.
 * @param {object} [options]
 * @param {number} [options.timeout] - Timeout in ms; 5000 when falsy.
 * @param {number} [options.maxBuffer] - Output cap in bytes; 1 MiB when falsy.
 * @param {boolean} [options.allowDiffExit] - Also accept exit status 1.
 * @returns {string} Captured stdout, or `''` when the process could not be
 *   run or exited with a status that is not accepted.
 */
function runGit(workdir, args, options = {}) {
  const { timeout, maxBuffer, allowDiffExit } = options;
  const outcome = spawnSync('git', ['-C', workdir, ...args], {
    encoding: 'utf8',
    timeout: timeout || 5000,
    maxBuffer: maxBuffer || 1024 * 1024,
  });

  if (outcome.error) return '';

  const exitedCleanly = outcome.status === 0;
  const toleratedDiffExit = Boolean(allowDiffExit) && outcome.status === 1;
  if (exitedCleanly || toleratedDiffExit) {
    return outcome.stdout || '';
  }
  return '';
}
604
+
605
/**
 * Assemble one patch describing every change in the work tree relative to
 * HEAD: staged changes, unstaged changes, and untracked files.
 *
 * Sections are emitted staged-first (HEAD -> index) and then unstaged
 * (index -> work tree), so a file that has both kinds of change is patched in
 * an order that composes when applied on top of HEAD.
 *
 * Section text is kept byte-for-byte. Diff output must not be trimmed: a hunk
 * may legitimately end with an empty context line (a single space), and
 * binary patches end with a blank line; stripping either corrupts the patch.
 *
 * @param {string} workdir - Directory of the git work tree.
 * @returns {string} The non-blank sections, each ending in a newline and
 *   separated by a blank line; `''` when there is nothing to report.
 */
function buildWorktreePatch(workdir) {
  const diffOptions = { timeout: 10000, maxBuffer: 20 * 1024 * 1024 };
  const sections = [
    runGit(workdir, ['diff', '--cached', '--binary', '--no-ext-diff'], diffOptions),
    runGit(workdir, ['diff', '--binary', '--no-ext-diff'], diffOptions),
    ...collectUntrackedDiffs(workdir),
  ]
    // Use trim() only to detect blank sections, never to rewrite them.
    .filter((section) => section.trim())
    .map((section) => (section.endsWith('\n') ? section : `${section}\n`));
  return sections.join('\n');
}
613
+
614
/**
 * Produce a "new file" diff for each untracked, non-ignored file in the work
 * tree, limited to the first 80 files listed by git.
 *
 * Paths come from `git ls-files -z`, which is NUL-delimited precisely so that
 * names can be used verbatim. They are therefore NOT trimmed: trimming would
 * rename files with leading or trailing whitespace, and their diffs would be
 * silently lost.
 *
 * @param {string} workdir - Directory of the git work tree.
 * @returns {string[]} One diff per untracked file that produced non-blank
 *   output.
 */
function collectUntrackedDiffs(workdir) {
  const listing = runGit(workdir, ['ls-files', '--others', '--exclude-standard', '-z']);
  const files = listing.split('\0').filter(Boolean).slice(0, 80);
  return files
    .map((file) => runGit(workdir, ['diff', '--no-index', '--binary', '--no-ext-diff', '--', '/dev/null', file], {
      timeout: 5000,
      maxBuffer: 5 * 1024 * 1024,
      // `git diff --no-index` exits with status 1 when the inputs differ.
      allowDiffExit: true,
    }))
    .filter((diff) => diff.trim());
}
625
+
626
// ---------------------------------------------------------------------------
// Adapter entry point: read the task payload, run the Cawdex command once in
// the task working directory, persist redacted artifacts, and emit a single
// structured result.
// ---------------------------------------------------------------------------
let payload;
try {
  payload = readInput();
} catch (err) {
  fail('invalid_adapter', `Could not read adapter input: ${err?.message || err}`);
  process.exit(0);
}

// A payload that is not an object would otherwise throw on the property reads
// below instead of producing a structured failure.
if (!payload || typeof payload !== 'object') {
  fail('invalid_adapter', 'Could not read adapter input: payload is not an object.');
  process.exit(0);
}

if (payload.mode !== 'task') {
  fail('unsupported_capability', 'Cawdex KBench adapter currently supports task mode only.');
  process.exit(0);
}

const task = payload.task || {};
const config = payload.config || {};
// Payload-level env takes precedence over the task-level env; taskEnv is kept
// separately because its workdir/repoPath and envVars are consulted as well.
const env = payload.env || task.env || {};
const taskEnv = task.env || {};
const benchmark = task.benchmark || 'swe';
const instruction = String(task.instruction || '').trim();
const profile = profileForBenchmark(benchmark);
const prompt = `/benchmark ${profile} ${instruction}`;
const workdir = config.workDir || env.workdir || env.repoPath || taskEnv.workdir || taskEnv.repoPath || process.cwd();
// Resolve the directory once, before the run, so that the cwd handed to the
// child process, the git snapshot and the reported workdir always agree.
const workdirUsed = existsSync(workdir) ? workdir : process.cwd();
const artifactRoot = config.storeDir
  || process.env.CAWDEX_KBENCH_ARTIFACT_DIR
  || join(tmpdir(), `cawdex-kbench-${process.pid}-${Date.now()}`);
mkdirSync(artifactRoot, { recursive: true });

const stdoutPath = join(artifactRoot, 'cawdex.stdout.txt');
const stderrPath = join(artifactRoot, 'cawdex.stderr.txt');
const instructionPath = join(artifactRoot, 'instruction.txt');
const traceDir = join(artifactRoot, 'cawdex-trace');
writeFileSync(instructionPath, redact(instruction), 'utf8');

const commandParts = splitCommand(process.env.CAWDEX_KBENCH_COMMAND || 'cawdex');
if (!commandParts.length) {
  fail('invalid_adapter', 'CAWDEX_KBENCH_COMMAND resolved to an empty command.');
  process.exit(0);
}

const [command, ...prefixArgs] = commandParts;
const args = [
  ...prefixArgs,
  '--prompt', prompt,
  '--perm', process.env.CAWDEX_KBENCH_PERMISSION || 'yolo',
  '--output-format', 'text',
  '--benchmark-trace-dir', traceDir,
];
if (config.modelName) args.push('--model', String(config.modelName));
if (config.temperature !== undefined) args.push('--temperature', String(config.temperature));
if (config.baseUrl) args.push('--base-url', String(config.baseUrl));
if (config.apiKeyEnv) args.push('--api-key-env', String(config.apiKeyEnv));
if (process.env.CAWDEX_KBENCH_EXTRA_ARGS) {
  args.push(...splitCommand(process.env.CAWDEX_KBENCH_EXTRA_ARGS));
}

// Child environment: inherited variables, then the trace settings, then
// env.envVars, then taskEnv.envVars (later entries override earlier ones).
// Only string values are copied from the payload.
const childEnv = {
  ...process.env,
  CAWDEX_BENCHMARK_TRACE: '1',
  CAWDEX_BENCHMARK_TRACE_DIR: traceDir,
  CAWDEX_BASH_TIMEOUT_MS: process.env.CAWDEX_BASH_TIMEOUT_MS || '300000',
};
for (const [key, value] of Object.entries(env.envVars || {})) {
  if (typeof value === 'string') childEnv[key] = value;
}
for (const [key, value] of Object.entries(taskEnv.envVars || {})) {
  if (typeof value === 'string') childEnv[key] = value;
}

const result = spawnSync(command, args, {
  cwd: workdirUsed,
  env: childEnv,
  encoding: 'utf8',
  // Only a positive numeric timeout is enforced; otherwise the run is unbounded.
  timeout: typeof config.timeoutMs === 'number' && config.timeoutMs > 0 ? config.timeoutMs : undefined,
  maxBuffer: 20 * 1024 * 1024,
});

const stdout = result.stdout || '';
const stderr = result.stderr || '';
writeFileSync(stdoutPath, redact(stdout), 'utf8');
writeFileSync(stderrPath, redact(stderr), 'utf8');
const elapsedMs = Date.now() - startedAt;
const timedOut = result.error?.code === 'ETIMEDOUT' || result.signal === 'SIGTERM';
// Without a numeric exit status, report 124 for a timeout and 1 otherwise.
const exitCode = typeof result.status === 'number' ? result.status : (timedOut ? 124 : 1);
const ok = exitCode === 0;
const traceRefs = collectTraceRefs(traceDir);
const traceSummary = compactTraceSummary(readLatestTraceSummary(traceDir));
const gitRefs = collectGitArtifactRefs(workdirUsed, artifactRoot);
const stdoutLines = stdout.split(/\r?\n/).map((line) => line.trim()).filter(Boolean);
// The last non-blank stdout line is used as the final answer text.
const finalText = stdoutLines.at(-1) || stdout.trim() || (ok ? 'Cawdex completed.' : 'Cawdex produced no stdout.');
const artifacts = [
  { kind: 'instruction', path: instructionPath, contentType: 'text/plain', description: 'KBench task instruction passed to Cawdex.' },
  { kind: 'stdout', path: stdoutPath, contentType: 'text/plain', description: 'Cawdex stdout.' },
  { kind: 'stderr', path: stderrPath, contentType: 'text/plain', description: 'Cawdex stderr.' },
  ...gitRefs,
  ...traceRefs,
];

const output = {
  ok,
  status: ok ? 'ok' : (timedOut ? 'timeout' : 'agent_error'),
  failureKind: ok ? undefined : (timedOut ? 'timeout' : `exit_${exitCode}`),
  finalText: truncate(finalText, 4000),
  elapsedMs,
  artifacts,
  trace: traceRefs.length ? { native: traceRefs } : undefined,
  benchmarkResult: {
    mode: 'cawdex-kbench',
    benchmark,
    profile,
    exitCode,
    workdir: workdirUsed,
    traceSummary,
    verificationEvidence: traceSummary?.verificationEvidence,
    experienceCard: traceSummary?.experienceCard,
    usage: traceSummary?.usage,
  },
  error: ok ? undefined : {
    message: truncate(stderr.trim() || stdout.trim() || result.error?.message || `Cawdex exited with code ${exitCode}`, 2000),
  },
};

emit(output);