@martinloop/mcp 0.2.7 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/README.md +49 -104
  2. package/dist/package-version.d.ts +1 -1
  3. package/dist/package-version.js +1 -1
  4. package/dist/prompts.d.ts +1 -1
  5. package/dist/resources.d.ts +1 -1
  6. package/dist/resources.js +2 -2
  7. package/dist/server-validation.d.ts +1 -0
  8. package/dist/server-validation.js +8 -0
  9. package/dist/server.js +87 -9
  10. package/dist/tools/doctor.d.ts +39 -1
  11. package/dist/tools/doctor.js +68 -9
  12. package/dist/tools/eval.js +3 -2
  13. package/dist/tools/get-run.d.ts +3 -0
  14. package/dist/tools/get-run.js +3 -1
  15. package/dist/tools/get-verification-results.d.ts +3 -0
  16. package/dist/tools/get-verification-results.js +3 -1
  17. package/dist/tools/plan.js +4 -2
  18. package/dist/tools/pr-tools.js +2 -1
  19. package/dist/tools/preflight.d.ts +41 -1
  20. package/dist/tools/preflight.js +74 -19
  21. package/dist/tools/run-dossier.d.ts +3 -0
  22. package/dist/tools/run-dossier.js +5 -2
  23. package/dist/tools/run-loop.d.ts +7 -2
  24. package/dist/tools/run-loop.js +67 -35
  25. package/dist/tools/run-store.js +67 -15
  26. package/dist/tools/tool-errors.js +1 -1
  27. package/dist/tools/tool-support.d.ts +8 -3
  28. package/dist/tools/tool-support.js +61 -18
  29. package/dist/tools/workflow-governance.d.ts +19 -3
  30. package/dist/tools/workflow-governance.js +107 -55
  31. package/dist/vendor/adapters/claude-cli.d.ts +45 -3
  32. package/dist/vendor/adapters/claude-cli.js +465 -45
  33. package/dist/vendor/adapters/cli-bridge.d.ts +46 -0
  34. package/dist/vendor/adapters/cli-bridge.js +147 -38
  35. package/dist/vendor/adapters/codex-launcher.d.ts +76 -0
  36. package/dist/vendor/adapters/codex-launcher.js +538 -0
  37. package/dist/vendor/adapters/index.d.ts +3 -2
  38. package/dist/vendor/adapters/index.js +3 -2
  39. package/dist/vendor/adapters/openai-compatible.d.ts +19 -4
  40. package/dist/vendor/adapters/openai-compatible.js +50 -19
  41. package/dist/vendor/adapters/runtime-support.d.ts +3 -0
  42. package/dist/vendor/adapters/runtime-support.js +9 -1
  43. package/dist/vendor/adapters/stub-direct-provider.js +3 -0
  44. package/dist/vendor/adapters/verifier-only.d.ts +2 -0
  45. package/dist/vendor/adapters/verifier-only.js +11 -4
  46. package/dist/vendor/contracts/index.d.ts +39 -0
  47. package/dist/vendor/contracts/index.js +2 -0
  48. package/dist/vendor/core/context-integrity.js +28 -3
  49. package/dist/vendor/core/grounding.d.ts +1 -0
  50. package/dist/vendor/core/grounding.js +6 -2
  51. package/dist/vendor/core/index.d.ts +24 -3
  52. package/dist/vendor/core/index.js +113 -21
  53. package/dist/vendor/core/leash.js +85 -8
  54. package/dist/vendor/core/persistence/index.d.ts +2 -0
  55. package/dist/vendor/core/persistence/index.js +1 -0
  56. package/dist/vendor/core/persistence/integrity.d.ts +38 -0
  57. package/dist/vendor/core/persistence/integrity.js +248 -0
  58. package/dist/vendor/core/persistence/store.d.ts +7 -0
  59. package/dist/vendor/core/persistence/store.js +25 -1
  60. package/dist/vendor/core/policy.d.ts +9 -0
  61. package/dist/workflow-state.d.ts +9 -0
  62. package/dist/workflow-state.js +46 -3
  63. package/package.json +2 -2
  64. package/server.json +2 -2
@@ -11,7 +11,8 @@
11
11
  *
12
12
  * MCP tools and integration tests use the same factories.
13
13
  */
14
- import { readGitExecutionArtifacts, runSubprocess, runVerification } from "./cli-bridge.js";
14
+ import { readGitExecutionArtifacts, resolveGitRepositoryRoot, runSubprocess, runVerification } from "./cli-bridge.js";
15
+ import { buildCodexExecArgs } from "./codex-launcher.js";
15
16
  import { createAdapterCapabilities, normalizeStructuredErrors, normalizeUsage } from "./runtime-support.js";
16
17
  // ---------------------------------------------------------------------------
17
18
  // Cost estimation
@@ -31,7 +32,14 @@ const MODEL_PRICING = {
31
32
  // Keep legacy names working
32
33
  "claude-opus": { inputPer1K: 0.015, outputPer1K: 0.075 },
33
34
  "claude-sonnet": { inputPer1K: 0.003, outputPer1K: 0.015 },
34
- "claude-haiku": { inputPer1K: 0.00025, outputPer1K: 0.00125 }
35
+ "claude-haiku": { inputPer1K: 0.00025, outputPer1K: 0.00125 },
36
+ // OpenAI coding models
37
+ "codex": { inputPer1K: 0.00125, cachedInputPer1K: 0.000125, outputPer1K: 0.01 },
38
+ "gpt-5-codex": { inputPer1K: 0.00125, cachedInputPer1K: 0.000125, outputPer1K: 0.01 },
39
+ "gpt-5.1-codex": { inputPer1K: 0.00125, cachedInputPer1K: 0.000125, outputPer1K: 0.01 },
40
+ "gpt-5.1-codex-max": { inputPer1K: 0.00125, cachedInputPer1K: 0.000125, outputPer1K: 0.01 },
41
+ "gpt-5.2-codex": { inputPer1K: 0.00175, cachedInputPer1K: 0.000175, outputPer1K: 0.014 },
42
+ "codex-mini-latest": { inputPer1K: 0.0015, cachedInputPer1K: 0.000375, outputPer1K: 0.006 }
35
43
  };
36
44
  function extractUsage(parsed, modelLabel) {
37
45
  if (!parsed?.usage) {
@@ -42,21 +50,275 @@ function extractUsage(parsed, modelLabel) {
42
50
  provenance: "unavailable"
43
51
  });
44
52
  }
45
- const tokensIn = (parsed.usage.inputTokens ?? parsed.usage.input_tokens ?? 0) +
46
- (parsed.usage.cacheReadInputTokens ?? parsed.usage.cache_read_input_tokens ?? 0) +
53
+ const promptTokens = (parsed.usage.inputTokens ?? parsed.usage.input_tokens ?? 0) +
47
54
  (parsed.usage.cacheCreationInputTokens ?? parsed.usage.cache_creation_input_tokens ?? 0);
55
+ const cachedInputTokens = parsed.usage.cacheReadInputTokens ?? parsed.usage.cache_read_input_tokens ?? 0;
56
+ const tokensIn = promptTokens + cachedInputTokens;
48
57
  const tokensOut = parsed.usage.outputTokens ?? parsed.usage.output_tokens ?? 0;
49
58
  const pricing = (modelLabel ? MODEL_PRICING[modelLabel] : undefined) ??
50
59
  { inputPer1K: BLENDED_INPUT_COST_PER_1K, outputPer1K: BLENDED_OUTPUT_COST_PER_1K };
51
- const actualUsd = (tokensIn / 1000) * pricing.inputPer1K +
52
- (tokensOut / 1000) * pricing.outputPer1K;
60
+ // Prefer Claude's own authoritative total_cost_usd (present on the final
61
+ // `result` event in json/stream-json output) over our pricing-table estimate,
62
+ // which can drift from real billed cost (cache discounts, surcharges, etc).
63
+ const hasAuthoritativeCost = typeof parsed.total_cost_usd === "number";
64
+ const actualUsd = hasAuthoritativeCost
65
+ ? parsed.total_cost_usd
66
+ : (promptTokens / 1000) * pricing.inputPer1K +
67
+ (cachedInputTokens / 1000) * (pricing.cachedInputPer1K ?? pricing.inputPer1K) +
68
+ (tokensOut / 1000) * pricing.outputPer1K;
53
69
  return normalizeUsage({
54
70
  actualUsd: Number(actualUsd.toFixed(6)),
55
71
  tokensIn,
56
72
  tokensOut,
57
- provenance: "actual"
73
+ cachedInputTokens,
74
+ provenance: hasAuthoritativeCost ? "actual" : "estimated",
75
+ providerSettlement: {
76
+ providerId: "claude",
77
+ model: modelLabel ?? "claude",
78
+ transport: "cli",
79
+ source: "claude_json",
80
+ inputTokens: promptTokens,
81
+ cachedInputTokens,
82
+ outputTokens: tokensOut,
83
+ rawUsageAvailable: true,
84
+ settledAt: new Date().toISOString()
85
+ }
58
86
  });
59
87
  }
/**
 * Extracts the final agent message and token usage from Codex CLI JSONL output.
 *
 * Every non-empty stdout line is parsed as JSON. Lines that fail to parse, or
 * that parse to something other than an object (`null`, numbers, strings,
 * booleans), are skipped so that stray output can never be mistaken for an
 * event or crash the property lookups below.
 *
 * The summary is the trimmed text of the last `item.completed` event whose
 * item type is `agent_message`, falling back to the trimmed stdout. Usage is
 * taken from the last `turn.completed` event that carries a `usage` object and
 * is priced from MODEL_PRICING (model label, then "codex", then blended rates).
 *
 * @param {string} stdout - Raw subprocess stdout.
 * @param {string} [modelLabel] - Pricing-table key; also recorded as the settlement model.
 * @returns {{ summary: string, usage: * } | undefined} `undefined` when no
 *   line yields a JSON object; otherwise the summary plus the value returned
 *   by `normalizeUsage`.
 */
function extractCodexJsonlResult(stdout, modelLabel) {
  const events = [];
  for (const rawLine of stdout.split(/\r?\n/u)) {
    const line = rawLine.trim();
    if (!line) {
      continue;
    }
    let event;
    try {
      event = JSON.parse(line);
    }
    catch {
      // Not JSON (plain-text log line, partial write): ignore it.
      continue;
    }
    // JSON.parse succeeds on `null` and bare primitives; only objects are events.
    if (event !== null && typeof event === "object") {
      events.push(event);
    }
  }
  if (events.length === 0) {
    return undefined;
  }
  const newestFirst = [...events].reverse();
  const latestAgentMessage = newestFirst
    .find((event) => event.type === "item.completed" && event.item?.type === "agent_message");
  const latestTurnCompleted = newestFirst
    .find((event) => event.type === "turn.completed" && event.usage !== undefined);
  const summary = typeof latestAgentMessage?.item?.text === "string" && latestAgentMessage.item.text.trim().length > 0
    ? latestAgentMessage.item.text.trim()
    : stdout.trim();
  if (!latestTurnCompleted?.usage) {
    // Events were seen but none reported usage: settle as "unavailable".
    return {
      summary,
      usage: normalizeUsage({
        actualUsd: 0,
        tokensIn: 0,
        tokensOut: 0,
        provenance: "unavailable",
        providerSettlement: {
          providerId: "codex",
          model: modelLabel ?? "codex",
          transport: "cli",
          source: "unavailable",
          inputTokens: 0,
          outputTokens: 0,
          rawUsageAvailable: false,
          settledAt: new Date().toISOString()
        }
      })
    };
  }
  // NOTE(review): the sums below assume cached_input_tokens is NOT already
  // included in input_tokens and reasoning_output_tokens is NOT already
  // included in output_tokens — confirm against the Codex CLI event schema.
  const promptTokens = latestTurnCompleted.usage.input_tokens ?? 0;
  const cachedInputTokens = latestTurnCompleted.usage.cached_input_tokens ?? 0;
  const outputTokens = latestTurnCompleted.usage.output_tokens ?? 0;
  const reasoningOutputTokens = latestTurnCompleted.usage.reasoning_output_tokens ?? 0;
  const tokensIn = promptTokens + cachedInputTokens;
  const tokensOut = outputTokens + reasoningOutputTokens;
  const pricing = (modelLabel ? MODEL_PRICING[modelLabel] : undefined) ??
    MODEL_PRICING["codex"] ??
    { inputPer1K: BLENDED_INPUT_COST_PER_1K, outputPer1K: BLENDED_OUTPUT_COST_PER_1K };
  // Cached input is billed at the cached rate when the pricing entry has one.
  const actualUsd = (promptTokens / 1000) * pricing.inputPer1K +
    (cachedInputTokens / 1000) * (pricing.cachedInputPer1K ?? pricing.inputPer1K) +
    (tokensOut / 1000) * pricing.outputPer1K;
  return {
    summary,
    usage: normalizeUsage({
      actualUsd: Number(actualUsd.toFixed(6)),
      tokensIn,
      tokensOut,
      cachedInputTokens,
      reasoningTokensOut: reasoningOutputTokens,
      provenance: "actual",
      providerSettlement: {
        providerId: "codex",
        model: modelLabel ?? "codex",
        transport: "cli",
        source: "codex_jsonl",
        inputTokens: promptTokens,
        cachedInputTokens,
        outputTokens,
        reasoningOutputTokens,
        rawUsageAvailable: true,
        settledAt: new Date().toISOString()
      }
    })
  };
}
/**
 * Extracts the response text and token usage from Gemini CLI JSON output.
 *
 * stdout is parsed as a single JSON document. The summary is the trimmed
 * `response` string, else the trimmed `error.message` string, else the trimmed
 * stdout. Usage is read from `stats.inputTokens`, `stats.cachedReadTokens`,
 * `stats.outputTokens` and `stats.thoughtTokens`; when `stats` is absent or
 * every counter is zero the usage is settled as "unavailable".
 *
 * @param {string} stdout - Raw subprocess stdout.
 * @param {string} [modelLabel] - Pricing-table key; also recorded as the settlement model.
 * @returns {{ summary: string, usage: * } | undefined} `undefined` when stdout
 *   is not JSON or does not parse to an object; otherwise the summary plus the
 *   value returned by `normalizeUsage`.
 */
function extractGeminiJsonResult(stdout, modelLabel) {
  let parsed;
  try {
    parsed = JSON.parse(stdout);
  }
  catch {
    return undefined;
  }
  // JSON.parse accepts `null` and bare primitives; those are not a structured
  // result, and `null` would throw on the property reads below.
  if (parsed === null || typeof parsed !== "object") {
    return undefined;
  }
  const summary = typeof parsed.response === "string" && parsed.response.trim().length > 0
    ? parsed.response.trim()
    : typeof parsed.error?.message === "string" && parsed.error.message.trim().length > 0
      ? parsed.error.message.trim()
      : stdout.trim();
  const promptTokens = parsed.stats?.inputTokens ?? 0;
  const cachedInputTokens = parsed.stats?.cachedReadTokens ?? 0;
  const outputTokens = parsed.stats?.outputTokens ?? 0;
  const reasoningOutputTokens = parsed.stats?.thoughtTokens ?? 0;
  const hasUsage = parsed.stats !== undefined &&
    (promptTokens > 0 || cachedInputTokens > 0 || outputTokens > 0 || reasoningOutputTokens > 0);
  if (!hasUsage) {
    return {
      summary,
      usage: normalizeUsage({
        actualUsd: 0,
        tokensIn: 0,
        tokensOut: 0,
        provenance: "unavailable",
        providerSettlement: {
          providerId: "gemini",
          model: modelLabel ?? "flash",
          transport: "cli",
          source: "unavailable",
          inputTokens: 0,
          outputTokens: 0,
          rawUsageAvailable: false,
          settledAt: new Date().toISOString()
        }
      })
    };
  }
  const tokensIn = promptTokens + cachedInputTokens;
  const tokensOut = outputTokens + reasoningOutputTokens;
  const pricing = (modelLabel ? MODEL_PRICING[modelLabel] : undefined) ??
    { inputPer1K: BLENDED_INPUT_COST_PER_1K, outputPer1K: BLENDED_OUTPUT_COST_PER_1K };
  // Cached input is billed at the cached rate when the pricing entry has one.
  const actualUsd = (promptTokens / 1000) * pricing.inputPer1K +
    (cachedInputTokens / 1000) * (pricing.cachedInputPer1K ?? pricing.inputPer1K) +
    (tokensOut / 1000) * pricing.outputPer1K;
  return {
    summary,
    usage: normalizeUsage({
      actualUsd: Number(actualUsd.toFixed(6)),
      tokensIn,
      tokensOut,
      cachedInputTokens,
      reasoningTokensOut: reasoningOutputTokens,
      provenance: "actual",
      providerSettlement: {
        providerId: "gemini",
        model: modelLabel ?? "flash",
        transport: "cli",
        source: "gemini_json",
        inputTokens: promptTokens,
        cachedInputTokens,
        outputTokens,
        reasoningOutputTokens,
        rawUsageAvailable: true,
        settledAt: new Date().toISOString()
      }
    })
  };
}
/**
 * Builds a line-oriented inspector for newline-delimited JSON stdout that
 * accumulates per-turn token usage and asks the caller to terminate the
 * subprocess once the estimated cumulative cost exceeds `capUsd`.
 *
 * Only `assistant` events carrying `message.usage` contribute to the totals;
 * the most recent `result` event is retained and exposed through `snapshot()`.
 * Lines that are not JSON objects are ignored. Text after the last newline
 * stays buffered until a later chunk completes the line.
 *
 * @param {number} capUsd - Spend ceiling in USD; the cap is only enforced when > 0.
 * @param {string} [modelLabel] - Pricing-table key; blended rates are used when
 *   it is missing or has no entry.
 * @returns {{ onChunk: Function, snapshot: Function }} `onChunk(chunk, terminate)`
 *   feeds a stdout chunk (string or bytes) and calls `terminate(reason)` when
 *   the cap is crossed; `snapshot()` returns the running totals.
 */
function createStreamingUsageInspector(capUsd, modelLabel) {
  const pricing = (modelLabel ? MODEL_PRICING[modelLabel] : undefined) ??
    { inputPer1K: BLENDED_INPUT_COST_PER_1K, outputPer1K: BLENDED_OUTPUT_COST_PER_1K };
  // Streaming decoder: a multi-byte UTF-8 character can straddle two chunks,
  // and decoding each chunk independently would turn it into U+FFFD.
  const decoder = new TextDecoder("utf-8");
  let buffer = "";
  let cumulativeUsd = 0;
  let tokensIn = 0;
  let tokensOut = 0;
  let turns = 0;
  let finalResult;
  const ingestLine = (line, terminate) => {
    const trimmed = line.trim();
    if (!trimmed) {
      return;
    }
    let event;
    try {
      event = JSON.parse(trimmed);
    }
    catch {
      return;
    }
    // `null` and bare primitives are valid JSON but not events; without this
    // guard a `null` line would throw inside the stdout handler.
    if (event === null || typeof event !== "object") {
      return;
    }
    if (event.type === "assistant" && event.message?.usage) {
      const usage = event.message.usage;
      const turnTokensIn = (usage.input_tokens ?? usage.inputTokens ?? 0) +
        (usage.cache_read_input_tokens ?? usage.cacheReadInputTokens ?? 0) +
        (usage.cache_creation_input_tokens ?? usage.cacheCreationInputTokens ?? 0);
      const turnTokensOut = usage.output_tokens ?? usage.outputTokens ?? 0;
      tokensIn += turnTokensIn;
      tokensOut += turnTokensOut;
      turns += 1;
      // All input tokens (including cache reads) are priced at the full input rate here.
      cumulativeUsd += (turnTokensIn / 1000) * pricing.inputPer1K + (turnTokensOut / 1000) * pricing.outputPer1K;
      if (capUsd > 0 && cumulativeUsd > capUsd) {
        terminate(`Streaming usage cap exceeded after ${String(turns)} turn(s): cumulative cost ~$${cumulativeUsd.toFixed(4)} ` +
          `surpassed the per-attempt cap $${capUsd.toFixed(4)} (derived from remaining loop budget). ` +
          `Subprocess terminated to bound runaway overspend.`);
      }
      return;
    }
    if (event.type === "result") {
      finalResult = event;
    }
  };
  return {
    onChunk: (chunk, terminate) => {
      buffer += typeof chunk === "string" ? chunk : decoder.decode(chunk, { stream: true });
      let newlineIndex = buffer.indexOf("\n");
      while (newlineIndex !== -1) {
        const line = buffer.slice(0, newlineIndex);
        buffer = buffer.slice(newlineIndex + 1);
        ingestLine(line, terminate);
        newlineIndex = buffer.indexOf("\n");
      }
    },
    snapshot: () => ({ cumulativeUsd, tokensIn, tokensOut, turns, ...(finalResult ? { finalResult } : {}) })
  };
}
/**
 * Parses Claude's `stream-json` output (one JSON object per line) and returns
 * the last event whose `type` is `"result"`.
 *
 * Blank lines, lines that are not valid JSON, and JSON values that are not
 * objects are skipped.
 *
 * @param {string} stdout - Raw subprocess stdout.
 * @returns {object | undefined} The final `result` event, or `undefined` when
 *   `stdout` is not a string or contains no such event.
 */
function parseStreamJsonResult(stdout) {
  if (typeof stdout !== "string") {
    return undefined;
  }
  let lastResult;
  for (const rawLine of stdout.split(/\r?\n/u)) {
    const line = rawLine.trim();
    if (!line) {
      continue;
    }
    let event;
    try {
      event = JSON.parse(line);
    }
    catch {
      // Ignore non-JSON / partial lines.
      continue;
    }
    // `null` and primitives parse successfully but are not events.
    if (event !== null && typeof event === "object" && event.type === "result") {
      lastResult = event;
    }
  }
  return lastResult;
}
60
322
  // ---------------------------------------------------------------------------
61
323
  // Structural failure hint detection
62
324
  //
@@ -90,6 +352,7 @@ export function createAgentCliAdapter(options) {
90
352
  const verifyTimeoutMs = options.verifyTimeoutMs ?? 60_000;
91
353
  const adapterId = `agent-cli:${options.adapterIdSuffix ?? options.command}`;
92
354
  const supportsJsonOutput = options.supportsJsonOutput === true;
355
+ const supportsUsageSettlement = supportsJsonOutput || options.command === "codex" || options.command === "gemini";
93
356
  const adapter = {
94
357
  adapterId,
95
358
  kind: "agent-cli",
@@ -100,10 +363,10 @@ export function createAgentCliAdapter(options) {
100
363
  transport: "cli",
101
364
  capabilities: createAdapterCapabilities({
102
365
  preflight: true,
103
- usageSettlement: supportsJsonOutput,
366
+ usageSettlement: supportsUsageSettlement,
104
367
  diffArtifacts: true,
105
368
  structuredErrors: true,
106
- cachingSignals: supportsJsonOutput
369
+ cachingSignals: supportsUsageSettlement
107
370
  })
108
371
  },
109
372
  async execute(request) {
@@ -130,12 +393,45 @@ export function createAgentCliAdapter(options) {
130
393
  }
131
394
  const args = options.argsBuilder(prompt);
132
395
  const stdinData = options.stdinBuilder?.(prompt);
396
+ // Live cumulative-cost circuit breaker: a single attempt should never be
397
+ // allowed to spend more than the loop has left. `--output-format json`
398
+ // only reports usage once the process exits, so for `stream-json` we
399
+ // watch per-turn usage events as they arrive and kill the subprocess the
400
+ // instant projected spend crosses what remains — bounding the worst case
401
+ // to roughly one turn's overshoot rather than the entire runaway session.
402
+ const streamingUsage = options.streamingUsageCap && request.context.remainingBudgetUsd > 0
403
+ ? createStreamingUsageInspector(request.context.remainingBudgetUsd, options.model ?? options.command)
404
+ : undefined;
133
405
  const agentResult = await runSubprocess(options.command, args, {
134
406
  cwd: workingDirectory,
135
407
  timeoutMs,
136
408
  spawnImpl: options.spawnImpl,
137
- ...(stdinData === undefined ? {} : { stdinData })
409
+ ...(stdinData === undefined ? {} : { stdinData }),
410
+ ...(streamingUsage ? { onStdoutChunk: streamingUsage.onChunk } : {})
138
411
  });
412
+ if (agentResult.terminationReason) {
413
+ const snapshot = streamingUsage?.snapshot();
414
+ const cumulativeUsd = snapshot?.cumulativeUsd ?? 0;
415
+ return {
416
+ status: "failed",
417
+ summary: `${options.command} subprocess terminated mid-run by the budget circuit breaker. ${agentResult.terminationReason}`,
418
+ usage: normalizeUsage({
419
+ actualUsd: Number(cumulativeUsd.toFixed(6)),
420
+ estimatedUsd: Number(cumulativeUsd.toFixed(6)),
421
+ tokensIn: snapshot?.tokensIn ?? 0,
422
+ tokensOut: snapshot?.tokensOut ?? 0,
423
+ provenance: "estimated"
424
+ }),
425
+ verification: {
426
+ passed: false,
427
+ summary: "Subprocess terminated by the streaming budget circuit breaker before verification could run."
428
+ },
429
+ failure: {
430
+ message: agentResult.terminationReason,
431
+ classHint: "budget_pressure"
432
+ }
433
+ };
434
+ }
139
435
  if (agentResult.timedOut) {
140
436
  return {
141
437
  status: "failed",
@@ -170,45 +466,108 @@ export function createAgentCliAdapter(options) {
170
466
  }
171
467
  };
172
468
  }
173
- // Parse JSON output if the CLI supports it (Claude with --output-format json)
469
+ // Parse JSON output if the CLI supports it. `stream-json` emits one JSON
470
+ // object per line — the final `result` event carries the same
471
+ // `result`/`usage`/`total_cost_usd` fields as single-blob `json` output.
174
472
  let parsed;
175
473
  if (supportsJsonOutput) {
176
474
  try {
177
- parsed = JSON.parse(agentResult.stdout);
475
+ parsed = options.streamingUsageCap
476
+ ? parseStreamJsonResult(agentResult.stdout)
477
+ : JSON.parse(agentResult.stdout);
178
478
  }
179
479
  catch {
180
480
  // Fall through to plain-text handling
181
481
  }
182
482
  }
183
- const agentText = parsed?.result ?? agentResult.stdout.trim();
483
+ const codexJsonlResult = !supportsJsonOutput && options.command === "codex"
484
+ ? extractCodexJsonlResult(agentResult.stdout, options.model)
485
+ : undefined;
486
+ const geminiJsonResult = !supportsJsonOutput && options.command === "gemini"
487
+ ? extractGeminiJsonResult(agentResult.stdout, options.model)
488
+ : undefined;
489
+ const producedStructuredCompletion = parsed?.result !== undefined ||
490
+ codexJsonlResult !== undefined ||
491
+ geminiJsonResult !== undefined;
492
+ if (agentResult.exitCode !== 0 && !producedStructuredCompletion) {
493
+ const failureMessage = formatPreVerifierSubprocessFailure(options.command, agentResult.stderr || agentResult.stdout, agentResult.exitCode);
494
+ return {
495
+ status: "failed",
496
+ summary: `${options.command} subprocess exited before verifier execution.`,
497
+ usage: normalizeUsage({
498
+ actualUsd: 0,
499
+ tokensIn: 0,
500
+ tokensOut: 0,
501
+ provenance: "unavailable"
502
+ }),
503
+ verification: { passed: false, summary: `Verifier not run: ${failureMessage}` },
504
+ failure: {
505
+ message: failureMessage
506
+ }
507
+ };
508
+ }
509
+ const agentText = codexJsonlResult?.summary ??
510
+ geminiJsonResult?.summary ??
511
+ parsed?.result ??
512
+ agentResult.stdout.trim();
184
513
  const summary = truncate(agentText, 2000);
185
514
  const usage = parsed?.usage
186
515
  ? extractUsage(parsed, options.model)
187
- : normalizeUsage({
188
- actualUsd: estimatedUsage.actualUsd,
189
- estimatedUsd: estimatedUsage.actualUsd,
190
- tokensIn: estimatedUsage.tokensIn,
191
- tokensOut: Math.max(estimatedUsage.tokensOut, Math.ceil(agentText.length / 4)),
192
- provenance: "estimated"
193
- });
516
+ : codexJsonlResult?.usage ??
517
+ geminiJsonResult?.usage ??
518
+ normalizeUsage({
519
+ actualUsd: estimatedUsage.actualUsd,
520
+ estimatedUsd: estimatedUsage.actualUsd,
521
+ tokensIn: estimatedUsage.tokensIn,
522
+ tokensOut: Math.max(estimatedUsage.tokensOut, Math.ceil(agentText.length / 4)),
523
+ provenance: "estimated",
524
+ providerSettlement: options.command === "codex"
525
+ ? {
526
+ providerId: "codex",
527
+ model: options.model ?? "codex",
528
+ transport: "cli",
529
+ source: "estimated_fallback",
530
+ inputTokens: estimatedUsage.tokensIn,
531
+ outputTokens: Math.max(estimatedUsage.tokensOut, Math.ceil(agentText.length / 4)),
532
+ rawUsageAvailable: false,
533
+ settledAt: new Date().toISOString()
534
+ }
535
+ : options.command === "gemini"
536
+ ? {
537
+ providerId: "gemini",
538
+ model: options.model ?? "flash",
539
+ transport: "cli",
540
+ source: "estimated_fallback",
541
+ inputTokens: estimatedUsage.tokensIn,
542
+ outputTokens: Math.max(estimatedUsage.tokensOut, Math.ceil(agentText.length / 4)),
543
+ rawUsageAvailable: false,
544
+ settledAt: new Date().toISOString()
545
+ }
546
+ : undefined
547
+ });
194
548
  const verificationStack = request.context.verificationStack;
195
549
  const verification = await runVerification(request.context.verificationPlan, workingDirectory, verifyTimeoutMs, verificationStack, options.spawnImpl);
196
550
  // Check for zero-diff (agent ran but made no file changes)
197
551
  const repoRoot = request.context.repoRoot;
552
+ const gitRepoRoot = repoRoot ? resolveGitRepositoryRoot(repoRoot) : undefined;
198
553
  let noDiff = false;
199
- if (repoRoot) {
200
- noDiff = await checkNoDiff(repoRoot);
554
+ if (gitRepoRoot) {
555
+ noDiff = await checkNoDiff(gitRepoRoot, options.spawnImpl);
201
556
  }
202
557
  // Extract structured errors from stderr/stdout for better failure context
203
558
  const structuredErrors = normalizeStructuredErrors(extractStructuredErrors(agentResult.stderr, agentResult.stdout));
204
- const executionArtifacts = repoRoot
205
- ? await readGitExecutionArtifacts(repoRoot, 5000, options.spawnImpl)
559
+ const executionArtifacts = gitRepoRoot
560
+ ? await readGitExecutionArtifacts(gitRepoRoot, 5000, options.spawnImpl)
206
561
  : undefined;
207
562
  // Scope contract enforcement: check touched files against allowedPaths/deniedPaths
208
563
  let scopeViolations = [];
209
564
  const scopeCtx = request.context;
210
- if (repoRoot && (scopeCtx.allowedPaths?.length || scopeCtx.deniedPaths?.length)) {
211
- const diffResult = await runSubprocess("git", ["diff", "--name-only", "HEAD"], { cwd: repoRoot, timeoutMs: 5000 });
565
+ if (gitRepoRoot && (scopeCtx.allowedPaths?.length || scopeCtx.deniedPaths?.length)) {
566
+ const diffResult = await runSubprocess("git", ["diff", "--name-only", "HEAD"], {
567
+ cwd: gitRepoRoot,
568
+ timeoutMs: 5000,
569
+ spawnImpl: options.spawnImpl
570
+ });
212
571
  if (diffResult.exitCode === 0 && diffResult.stdout.trim()) {
213
572
  const touchedFiles = diffResult.stdout.trim().split("\n").filter(Boolean);
214
573
  const allowed = scopeCtx.allowedPaths ?? [];
@@ -278,7 +637,12 @@ export function createAgentCliAdapter(options) {
278
637
  }
279
638
  // Reset tracked files to HEAD so next attempt starts from clean state
280
639
  try {
281
- await runSubprocess("git", ["restore", "--staged", "--worktree", "."], { cwd: repoRoot, timeoutMs: 5000 });
640
+ if (gitRepoRoot) {
641
+ await runSubprocess("git", ["restore", "--staged", "--worktree", "."], {
642
+ cwd: gitRepoRoot,
643
+ timeoutMs: 5000
644
+ });
645
+ }
282
646
  }
283
647
  catch {
284
648
  // Non-fatal
@@ -326,10 +690,16 @@ export function createAgentCliAdapter(options) {
326
690
  // Pre-configured: Claude CLI
327
691
  // ---------------------------------------------------------------------------
328
692
  /**
329
- * Spawns `claude --output-format json --print "<prompt>" --dangerously-skip-permissions [extraArgs]`.
693
+ * Spawns `claude --output-format stream-json --verbose --print "<prompt>" [extraArgs]`.
330
694
  *
331
- * The --output-format json flag causes Claude CLI to return structured JSON
332
- * including real token usage counts, enabling accurate cost tracking.
695
+ * `stream-json` emits one JSON event per line including per-turn usage on
696
+ * each `assistant` message and a final `result` event carrying the same
697
+ * `result`/`usage`/`total_cost_usd` fields as single-blob `json` output — so
698
+ * MartinLoop can both (a) recover real token usage/cost as before, and
699
+ * (b) watch cumulative spend live and self-terminate the subprocess the
700
+ * moment it crosses the remaining per-attempt budget (see
701
+ * `streamingUsageCap` / `createStreamingUsageInspector`), instead of only
702
+ * discovering an overspend after the whole process has already exited.
333
703
  *
334
704
  * Requires the Claude Code CLI to be installed and authenticated:
335
705
  * https://docs.anthropic.com/claude-code
@@ -346,10 +716,12 @@ export function createClaudeCliAdapter(options = {}) {
346
716
  timeoutMs: options.timeoutMs,
347
717
  verifyTimeoutMs: options.verifyTimeoutMs,
348
718
  supportsJsonOutput: true,
719
+ streamingUsageCap: true,
349
720
  spawnImpl: options.spawnImpl,
350
721
  argsBuilder: (_prompt) => [
351
722
  "--output-format",
352
- "json",
723
+ "stream-json",
724
+ "--verbose",
353
725
  "--print",
354
726
  "--dangerously-skip-permissions",
355
727
  ...modelArgs,
@@ -372,12 +744,12 @@ export function createClaudeCliAdapter(options = {}) {
372
744
  * npm install -g @openai/codex
373
745
  */
374
746
  export function createCodexCliAdapter(options = {}) {
375
- const modelArgs = options.model ? ["--model", options.model] : [];
376
747
  const extraArgs = options.extraArgs ?? [];
377
748
  const sandbox = options.sandbox ?? "workspace-write";
378
749
  const workingDirectory = options.workingDirectory ?? process.cwd();
750
+ const command = options.command ?? "codex";
379
751
  return createAgentCliAdapter({
380
- command: "codex",
752
+ command,
381
753
  adapterIdSuffix: "codex",
382
754
  model: options.model ?? "codex",
383
755
  label: options.label ?? "Codex CLI adapter",
@@ -386,17 +758,53 @@ export function createCodexCliAdapter(options = {}) {
386
758
  verifyTimeoutMs: options.verifyTimeoutMs,
387
759
  supportsJsonOutput: false,
388
760
  spawnImpl: options.spawnImpl,
389
- argsBuilder: () => [
390
- "exec",
391
- "--cd",
761
+ argsBuilder: () => buildCodexExecArgs({
392
762
  workingDirectory,
393
- "--sandbox",
394
763
  sandbox,
395
- "--color",
396
- "never",
397
- ...modelArgs,
398
- ...extraArgs,
399
- "-"
764
+ ...(options.model ? { model: options.model } : {}),
765
+ extraArgs,
766
+ mode: "prompt"
767
+ }),
768
+ stdinBuilder: (prompt) => prompt
769
+ });
770
+ }
771
+ // ---------------------------------------------------------------------------
772
+ // Pre-configured: Gemini CLI
773
+ // ---------------------------------------------------------------------------
774
+ /**
775
+ * Spawns `gemini --model <model> --prompt "" --approval-mode <mode> --output-format json [...]`.
776
+ *
777
+ * The prompt is delivered via stdin while forcing headless mode with `--prompt ""`,
778
+ * which keeps large MartinLoop prompts off the command line on Windows.
779
+ *
780
+ * Requires the Gemini CLI to be installed and authenticated:
781
+ * npm install -g @google/gemini-cli
782
+ */
783
+ export function createGeminiCliAdapter(options = {}) {
784
+ const model = options.model ?? "flash";
785
+ const approvalMode = options.approvalMode ?? "yolo";
786
+ const extraArgs = options.extraArgs ?? [];
787
+ return createAgentCliAdapter({
788
+ command: "gemini",
789
+ adapterIdSuffix: "gemini",
790
+ model,
791
+ label: options.label ?? "Gemini CLI adapter",
792
+ workingDirectory: options.workingDirectory,
793
+ timeoutMs: options.timeoutMs,
794
+ verifyTimeoutMs: options.verifyTimeoutMs,
795
+ supportsJsonOutput: false,
796
+ spawnImpl: options.spawnImpl,
797
+ argsBuilder: () => [
798
+ "--model",
799
+ model,
800
+ "--prompt",
801
+ "",
802
+ "--approval-mode",
803
+ approvalMode,
804
+ ...(options.sandbox ? ["--sandbox"] : []),
805
+ "--output-format",
806
+ "json",
807
+ ...extraArgs
400
808
  ],
401
809
  stdinBuilder: (prompt) => prompt
402
810
  });
@@ -559,7 +967,15 @@ function redactSecretsForPrompt(input) {
559
967
  return input
560
968
  .replace(/\bOPENAI_API_KEY\s*=\s*[^\s"'`]+/giu, "OPENAI_API_KEY=[REDACTED_SECRET]")
561
969
  .replace(/\bsk-[A-Za-z0-9_-]{8,}\b/gu, "[REDACTED_SECRET]")
562
- .replace(/\bghp_[A-Za-z0-9_]{8,}\b/gu, "[REDACTED_SECRET]")
970
+ .replace(/\bghp_[A-Za-z0-9_]{16,}\b/gu, "[REDACTED_SECRET]")
971
+ .replace(/\bgithub_pat_[A-Za-z0-9_]{20,}\b/gu, "[REDACTED_SECRET]")
972
+ .replace(/\b(?:gho|ghu|ghs|ghr)_[A-Za-z0-9_]{16,}\b/gu, "[REDACTED_SECRET]")
973
+ .replace(/\bAKIA[0-9A-Z]{16}\b/gu, "[REDACTED_SECRET]")
974
+ .replace(/\b(?:aws_secret_access_key|AWS_SECRET_ACCESS_KEY)\s*[:=]\s*[^\s"'`]+/giu, "AWS_SECRET_ACCESS_KEY=[REDACTED_SECRET]")
975
+ .replace(/\bxox[baprs]-[A-Za-z0-9-]{10,}\b/giu, "[REDACTED_SECRET]")
976
+ .replace(/\bAIza[0-9A-Za-z_-]{30,}\b/gu, "[REDACTED_SECRET]")
977
+ .replace(/-----BEGIN(?:\s+[A-Z0-9]+)*\s+PRIVATE KEY-----[\s\S]*?-----END(?:\s+[A-Z0-9]+)*\s+PRIVATE KEY-----/gu, "[REDACTED_SECRET]")
978
+ .replace(/\beyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b/gu, "[REDACTED_SECRET]")
563
979
  .replace(/\B\.env(?!\.example\b)(?:\.[A-Za-z0-9._-]+)?\b/giu, "[REDACTED_PATH]");
564
980
  }
565
981
  function extractStructuredErrors(stderr, stdout) {
@@ -579,7 +995,11 @@ function extractStructuredErrors(stderr, stdout) {
579
995
  }
580
996
  return errors.slice(0, 10); // cap at 10 to avoid bloating prompts
581
997
  }
/**
 * Runs `git diff --name-only HEAD` in `repoRoot` and reports whether it
 * printed nothing.
 *
 * @param {string} repoRoot - Directory used as the git command's working directory.
 * @param {Function} [spawnImpl] - Spawn implementation forwarded to `runSubprocess`.
 * @returns {Promise<boolean>} `true` only when the command exits with code 0
 *   and its trimmed stdout is empty; `false` otherwise (including when the
 *   command fails).
 */
async function checkNoDiff(repoRoot, spawnImpl) {
  const gitArgs = ["diff", "--name-only", "HEAD"];
  const diff = await runSubprocess("git", gitArgs, {
    cwd: repoRoot,
    timeoutMs: 5000,
    spawnImpl
  });
  if (diff.exitCode !== 0) {
    return false;
  }
  return diff.stdout.trim().length === 0;
}