martin-loop 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +52 -16
- package/demo/seeded-workspace/README.md +35 -0
- package/demo/seeded-workspace/TASKS.md +29 -0
- package/demo/seeded-workspace/martin.config.yaml +11 -0
- package/demo/seeded-workspace/package.json +8 -0
- package/demo/seeded-workspace/src/invoice-summary.js +11 -0
- package/demo/seeded-workspace/test/invoice-summary.test.js +20 -0
- package/dist/vendor/adapters/claude-cli.d.ts +19 -4
- package/dist/vendor/adapters/claude-cli.js +55 -24
- package/dist/vendor/adapters/cli-bridge.d.ts +1 -0
- package/dist/vendor/adapters/cli-bridge.js +154 -28
- package/dist/vendor/adapters/index.d.ts +1 -0
- package/dist/vendor/adapters/index.js +1 -0
- package/dist/vendor/adapters/verifier-only.d.ts +7 -0
- package/dist/vendor/adapters/verifier-only.js +57 -0
- package/dist/vendor/cli/index.d.ts +6 -1
- package/dist/vendor/cli/index.js +124 -7
- package/dist/vendor/contracts/index.d.ts +3 -1
- package/dist/vendor/core/compiler.d.ts +2 -0
- package/dist/vendor/core/compiler.js +10 -4
- package/dist/vendor/core/context-integrity.d.ts +26 -0
- package/dist/vendor/core/context-integrity.js +56 -0
- package/dist/vendor/core/index.d.ts +5 -2
- package/dist/vendor/core/index.js +186 -54
- package/dist/vendor/core/policy.d.ts +6 -0
- package/docs/distribution/DIRECTORY-SUBMISSIONS.md +89 -0
- package/docs/distribution/INTEGRATION-OUTREACH.md +61 -0
- package/docs/distribution/UNDER-3-CHALLENGE.md +65 -0
- package/docs/oss/CLAUDE-CODE-WALKTHROUGH.md +142 -0
- package/docs/oss/EXAMPLES.md +9 -1
- package/docs/oss/OSS-BOUNDARY-REPORT.json +3 -7
- package/docs/oss/OSS-BOUNDARY-REPORT.md +2 -2
- package/docs/oss/QUICKSTART.md +33 -3
- package/docs/oss/RALPH-LOOP-SAFETY.md +113 -0
- package/docs/oss/README.md +6 -3
- package/docs/oss/RELEASE-SURFACE-REPORT.json +1 -1
- package/docs/oss/RELEASE-SURFACE-REPORT.md +1 -1
- package/package.json +8 -2
package/README.md
CHANGED
|
@@ -2,25 +2,27 @@
|
|
|
2
2
|
|
|
3
3
|
<img src="./docs/assets/martinloop-logo.png" alt="MartinLoop" width="260">
|
|
4
4
|
|
|
5
|
-
###
|
|
5
|
+
### The cross-agent governance layer for autonomous AI coding agents.⭐
|
|
6
6
|
|
|
7
7
|
[](./LICENSE)
|
|
8
8
|
[](./tsconfig.base.json)
|
|
9
9
|
[](#quick-start)
|
|
10
10
|
[](https://www.npmjs.com/package/martin-loop)
|
|
11
11
|
|
|
12
|
+
MartinLoop has been accepted into the NVIDIA Inception program.
|
|
13
|
+
|
|
12
14
|
<br>
|
|
13
15
|
|
|
14
16
|
**Your overnight AI pipeline estimated $2.40.**
|
|
15
17
|
**You woke up to a $65 bill.**
|
|
16
18
|
<br> 47 retries. No hard stop. No rollback. No audit trail. Nothing merged.
|
|
17
19
|
MartinLoop exists so that never happens again.✅ <br> <br>
|
|
18
|
-
If you think autonomous AI coding agents need budgets, brakes, and receipts, ⭐ the repo so more builders can find it.
|
|
20
|
+
If you think autonomous AI coding agents need budgets, brakes, and receipts, please star ⭐ the repo so more builders can find it.
|
|
19
21
|
<br>
|
|
20
22
|
|
|
21
23
|
> AI coding agents are useful. Unbounded retry loops are not.
|
|
22
24
|
>
|
|
23
|
-
> MartinLoop wraps agent runs with budgets, policy checks, verifier gates, rollback evidence, and inspectable run records.
|
|
25
|
+
> MartinLoop wraps agent runs with budgets, policy checks, verifier gates, rollback evidence, and inspectable run records. Built for Enterprise Coding Agents, Agentic Teams, and Autonomous Companies.
|
|
24
26
|
<br>
|
|
25
27
|
<img src="./docs/assets/cli-animated.svg" alt="MartinLoop CLI — governed agent run" width="720">
|
|
26
28
|
|
|
@@ -58,6 +60,7 @@ It does not try to replace the agent pattern. It makes that pattern safe to run.
|
|
|
58
60
|
| Verifier gate | A run only reaches `completed` when the adapter result and verifier state pass. Unsafe verifier commands are blocked before agent execution. |
|
|
59
61
|
| Failure taxonomy | Classifies failures across 11 current classes, including hallucination, test regression, scope creep, repo grounding failure, environment mismatch, and budget pressure, to distinguish real success from unsafe, invalid, or terminal behavior. |
|
|
60
62
|
| Safety leash | Evaluates verifier commands, file scope, dependency or migration changes that require approval, and secret-like values in task text. **Policy-as-code**. |
|
|
63
|
+
| Context integrity | Scans user prompts and tool output for injection patterns (authority inversion, instruction override, identity redefinition) before any attempt is admitted. Aborts with human escalation on detection. |
|
|
61
64
|
| Rollback evidence | Captures rollback boundaries and restore outcomes for repo-backed attempts when a persistence store is configured. |
|
|
62
65
|
| Context distillation | Carries a distilled summary of recent attempts and remaining constraints into subsequent attempts. |
|
|
63
66
|
| Run records | The CLI appends JSONL loop records under `~/.martin/runs/<workspaceId>.jsonl`; lower-level stores can also persist contracts, ledgers, and attempt artifacts. |
|
|
@@ -66,7 +69,7 @@ It does not try to replace the agent pattern. It makes that pattern safe to run.
|
|
|
66
69
|
⭐The result is a runtime that can complete good work, refuse unsafe work, stop uneconomical work, and leave evidence behind.✅
|
|
67
70
|
---
|
|
68
71
|
|
|
69
|
-
##
|
|
72
|
+
## Ralph-Style Loops Need a Control Layer
|
|
70
73
|
|
|
71
74
|
**"Everybody has gotten infatuated with what we call these Ralph Wiggum loops, just like send the thing off and it'll just go figure something out... A, it never figures anything out. And B, you just get this ginormous bill..."** — Chamath Palihapitiya, All-In Podcast #263, March 2026
|
|
72
75
|
|
|
@@ -82,7 +85,7 @@ The pattern is simple: attempt the task, run checks, retry on failure, repeat. T
|
|
|
82
85
|
- it rolls back failed runs instead of leaving broken state behind
|
|
83
86
|
- it reduces runaway token growth with context distillation
|
|
84
87
|
|
|
85
|
-
If Ralph ever burned $165.70 on your dime, you're in the right place. Martin stopped him at $
|
|
88
|
+
If a Ralph-style loop has ever burned budget without producing a verified result, MartinLoop is designed to stop that failure mode before the next unsafe attempt runs. If Ralph ever burned $165.70 on your dime, you're in the right place. Martin stopped him at $40.97 with a full audit trail.
|
|
86
89
|
|
|
87
90
|
<div align="center">
|
|
88
91
|
<img src="./docs/assets/martin-raplph.png.jpg" alt="Martin vs Ralph — governed vs ungoverned agent loop" width="240">
|
|
@@ -119,6 +122,8 @@ pnpm --filter @martin/benchmarks eval
|
|
|
119
122
|
pnpm --filter @martin/benchmarks eval:phase12
|
|
120
123
|
```
|
|
121
124
|
|
|
125
|
+
Challenge page: [Can your AI coding agent finish this task under $3?](./docs/distribution/UNDER-3-CHALLENGE.md)
|
|
126
|
+
|
|
122
127
|
---
|
|
123
128
|
|
|
124
129
|
## Quick Start
|
|
@@ -127,7 +132,9 @@ pnpm --filter @martin/benchmarks eval:phase12
|
|
|
127
132
|
npm install -g martin-loop
|
|
128
133
|
```
|
|
129
134
|
|
|
130
|
-
This installs both the `martin-loop` package and the `martin` command alias. The package is currently published on npm as version `0.1.
|
|
135
|
+
This installs both the `martin-loop` package and the `martin` command alias. The package is currently published on npm as version `0.1.5`.
|
|
136
|
+
|
|
137
|
+
Want a safe sandbox first? Run `npx martin-loop demo` and MartinLoop will copy a disposable local workspace into `./martin-loop-demo`.
|
|
131
138
|
|
|
132
139
|
### Public Package Surface
|
|
133
140
|
|
|
@@ -136,8 +143,23 @@ The frozen public package surface for this release candidate is:
|
|
|
136
143
|
- Install target: `npm install martin-loop`
|
|
137
144
|
- CLI target: `npx martin-loop`
|
|
138
145
|
- SDK target: `import { MartinLoop } from "martin-loop"`
|
|
146
|
+
- MCP target (registry-ready package): `npx -y @martinloop/mcp`
|
|
139
147
|
|
|
140
148
|
The `martin` command alias is installed for local operator convenience, but the public CLI surface is `npx martin-loop`.
|
|
149
|
+
The standalone MCP server package is smoke-validated locally with `pnpm --filter @martinloop/mcp smoke:pack` and is ready for registry publication as a separate release step.
|
|
150
|
+
|
|
151
|
+
### Claude Code MCP install
|
|
152
|
+
|
|
153
|
+
Use the published MCP package directly:
|
|
154
|
+
|
|
155
|
+
- macOS/Linux: `claude mcp add --scope user martin-loop -- npx -y @martinloop/mcp`
|
|
156
|
+
- Windows PowerShell/cmd: `claude mcp add --scope user martin-loop -- cmd /c "npx -y @martinloop/mcp"`
|
|
157
|
+
|
|
158
|
+
If you just want to launch the server manually, the one-line command is:
|
|
159
|
+
|
|
160
|
+
```sh
|
|
161
|
+
npx @martinloop/mcp
|
|
162
|
+
```
|
|
141
163
|
|
|
142
164
|
### Run a governed task
|
|
143
165
|
|
|
@@ -196,7 +218,7 @@ martin run <objective> [options]
|
|
|
196
218
|
--metadata <key=value> Attach metadata to the run record; repeatable
|
|
197
219
|
```
|
|
198
220
|
|
|
199
|
-
The public CLI also includes `inspect`, `resume`, and a `bench` redirect that points reviewers to the workspace benchmark harness.
|
|
221
|
+
The public CLI also includes `demo`, `inspect`, `resume`, and a `bench` redirect that points reviewers to the workspace benchmark harness.
|
|
200
222
|
|
|
201
223
|
<div align="center">
|
|
202
224
|
<img src="./docs/assets/cli-static.svg" alt="MartinLoop CLI terminal output" width="720">
|
|
@@ -287,20 +309,22 @@ The lower-level `runMartin` function is also exported for callers that want to a
|
|
|
287
309
|
| `@martin/core` | Runtime controller, policy engine, safety leash, grounding, persistence, and rollback logic. |
|
|
288
310
|
| `@martin/adapters` | Claude CLI, Codex CLI, direct-provider, and stub adapter surfaces. |
|
|
289
311
|
| `@martin/cli` | Local CLI implementation for `run`, `demo`, `inspect`, `resume`, and the benchmark redirect. |
|
|
290
|
-
| `@
|
|
312
|
+
| `@martinloop/mcp` | MCP server tools: `martin_run`, `martin_inspect`, and `martin_status`. |
|
|
291
313
|
| `benchmarks/` | Workspace-only deterministic benchmark and RC validation harness. |
|
|
292
314
|
| `apps/control-plane/` | Hosted control-plane workstream, outside the initial npm package surface. |
|
|
293
315
|
| `apps/local-dashboard/` | Local dashboard/read-model viewer, not currently packaged as public npm API. |
|
|
294
316
|
|
|
295
|
-
The `@martin/core`, `@martin/adapters`, and `@martin/contracts` package manifests are still private workspace packages
|
|
317
|
+
The `@martin/core`, `@martin/adapters`, and `@martin/contracts` package manifests are still private workspace packages. The public runtime install target is the root `martin-loop` facade, while `@martinloop/mcp` is packaged as a standalone MCP server with vendored internal runtime dependencies for registry publication.
|
|
296
318
|
|
|
297
319
|
---
|
|
298
|
-
|
|
299
320
|
## Development
|
|
300
321
|
|
|
301
|
-
Requirements:
|
|
322
|
+
Requirements:
|
|
302
323
|
|
|
303
|
-
|
|
324
|
+
- Node 20+
|
|
325
|
+
- pnpm 10.x
|
|
326
|
+
|
|
327
|
+
```bash
|
|
304
328
|
git clone https://github.com/Keesan12/martin-loop.git
|
|
305
329
|
cd martin-loop
|
|
306
330
|
pnpm install
|
|
@@ -308,9 +332,9 @@ pnpm install
|
|
|
308
332
|
pnpm test
|
|
309
333
|
pnpm lint
|
|
310
334
|
pnpm build
|
|
335
|
+
|
|
311
336
|
```
|
|
312
337
|
|
|
313
|
-
```md
|
|
314
338
|
Current RC gate commands:
|
|
315
339
|
|
|
316
340
|
```sh
|
|
@@ -320,9 +344,7 @@ pnpm repo:smoke
|
|
|
320
344
|
pnpm rc:validate
|
|
321
345
|
pnpm pilot:prep:validate
|
|
322
346
|
pnpm release:matrix:local
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
This package is published through the public martin-loop package surface. Treat registry publication as a guarded release step: verify the RC gate commands, confirm the version follows semantic versioning, and document breaking changes before publishing.
|
|
347
|
+
```
|
|
326
348
|
|
|
327
349
|
> **Caution:** This package is live on npm. Treat registry publication as a guarded release step — verify the RC gate commands, confirm semantic versioning, and document breaking changes before publishing.
|
|
328
350
|
|
|
@@ -332,6 +354,11 @@ Helpful docs:
|
|
|
332
354
|
|
|
333
355
|
- [OSS quickstart](./docs/oss/QUICKSTART.md)
|
|
334
356
|
- [OSS examples](./docs/oss/EXAMPLES.md)
|
|
357
|
+
- [Under-$3 benchmark challenge](./docs/distribution/UNDER-3-CHALLENGE.md)
|
|
358
|
+
- [Directory submission pack](./docs/distribution/DIRECTORY-SUBMISSIONS.md)
|
|
359
|
+
- [Integration outreach pack](./docs/distribution/INTEGRATION-OUTREACH.md)
|
|
360
|
+
- [Claude Code walkthrough](./docs/oss/CLAUDE-CODE-WALKTHROUGH.md)
|
|
361
|
+
- [Ralph-style loop safety guide](./docs/oss/RALPH-LOOP-SAFETY.md)
|
|
335
362
|
- [OSS boundary report](./docs/oss/OSS-BOUNDARY-REPORT.md)
|
|
336
363
|
- [Release surface report](./docs/oss/RELEASE-SURFACE-REPORT.md)
|
|
337
364
|
|
|
@@ -360,3 +387,12 @@ Conventional commit prefixes: `feat:`, `fix:`, `chore:`, `docs:`, `refactor:`, a
|
|
|
360
387
|
*"AI coding accountability: completes good work, refuses unsafe work, stops uneconomical work."*
|
|
361
388
|
|
|
362
389
|
</div>
|
|
390
|
+
|
|
391
|
+
<div align="center">
|
|
392
|
+
<br>
|
|
393
|
+
<picture>
|
|
394
|
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/Keesan12/martin-loop/main/docs/assets/nvidia-inception-program.png">
|
|
395
|
+
<img src="https://raw.githubusercontent.com/Keesan12/martin-loop/main/docs/assets/nvidia-inception-program-light.png" alt="NVIDIA Inception Program logo" width="280">
|
|
396
|
+
</picture>
|
|
397
|
+
<br>
|
|
398
|
+
</div>
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# MartinLoop Demo Sandbox
|
|
2
|
+
|
|
3
|
+
This workspace is the safe public demo copied by `martin-loop demo`.
|
|
4
|
+
|
|
5
|
+
It is intentionally small:
|
|
6
|
+
|
|
7
|
+
- `npm test` is green out of the box
|
|
8
|
+
- `martin.config.yaml` keeps the budget tiny
|
|
9
|
+
- the first suggested MartinLoop run can stay in stub mode with `MARTIN_LIVE=false`
|
|
10
|
+
|
|
11
|
+
## Files
|
|
12
|
+
|
|
13
|
+
- `src/invoice-summary.js`: tiny module used by the demo task
|
|
14
|
+
- `test/invoice-summary.test.js`: Node test suite
|
|
15
|
+
- `TASKS.md`: suggested objectives for a stub-safe run or a live adapter run
|
|
16
|
+
- `martin.config.yaml`: low-risk governance defaults
|
|
17
|
+
|
|
18
|
+
## Suggested flow
|
|
19
|
+
|
|
20
|
+
```sh
|
|
21
|
+
npm install
|
|
22
|
+
npm test
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Safe first run:
|
|
26
|
+
|
|
27
|
+
```sh
|
|
28
|
+
MARTIN_LIVE=false npx martin-loop run "Summarize the demo workspace and confirm the verifier is green" --verify "npm test"
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Optional live run:
|
|
32
|
+
|
|
33
|
+
```sh
|
|
34
|
+
npx martin-loop run "Add support for a discount percentage to summarizeInvoice and update the tests" --verify "npm test" --engine codex
|
|
35
|
+
```
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Suggested Demo Tasks
|
|
2
|
+
|
|
3
|
+
## Stub-safe first run
|
|
4
|
+
|
|
5
|
+
Use this when you want to see MartinLoop create a governed run record without spending provider budget:
|
|
6
|
+
|
|
7
|
+
```text
|
|
8
|
+
Summarize the demo workspace, confirm the verifier command is green, and explain the safest next change to make.
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Verifier:
|
|
12
|
+
|
|
13
|
+
```sh
|
|
14
|
+
npm test
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Optional live run
|
|
18
|
+
|
|
19
|
+
Use this when you want a real coding task in the sandbox:
|
|
20
|
+
|
|
21
|
+
```text
|
|
22
|
+
Add support for a discount percentage to summarizeInvoice and update the tests while keeping the existing tax behavior intact.
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Verifier:
|
|
26
|
+
|
|
27
|
+
```sh
|
|
28
|
+
npm test
|
|
29
|
+
```
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
export function summarizeInvoice(items, taxRate = 0) {
|
|
2
|
+
const subtotal = items.reduce((sum, item) => sum + item.quantity * item.unitPrice, 0);
|
|
3
|
+
const tax = Number((subtotal * taxRate).toFixed(2));
|
|
4
|
+
const total = Number((subtotal + tax).toFixed(2));
|
|
5
|
+
|
|
6
|
+
return {
|
|
7
|
+
subtotal: Number(subtotal.toFixed(2)),
|
|
8
|
+
tax,
|
|
9
|
+
total
|
|
10
|
+
};
|
|
11
|
+
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import test from "node:test";
|
|
2
|
+
import assert from "node:assert/strict";
|
|
3
|
+
|
|
4
|
+
import { summarizeInvoice } from "../src/invoice-summary.js";
|
|
5
|
+
|
|
6
|
+
test("summarizeInvoice returns subtotal, tax, and total", () => {
|
|
7
|
+
const result = summarizeInvoice(
|
|
8
|
+
[
|
|
9
|
+
{ quantity: 2, unitPrice: 19.99 },
|
|
10
|
+
{ quantity: 1, unitPrice: 5.5 }
|
|
11
|
+
],
|
|
12
|
+
0.13
|
|
13
|
+
);
|
|
14
|
+
|
|
15
|
+
assert.deepEqual(result, {
|
|
16
|
+
subtotal: 45.48,
|
|
17
|
+
tax: 5.91,
|
|
18
|
+
total: 51.39
|
|
19
|
+
});
|
|
20
|
+
});
|
|
@@ -15,15 +15,18 @@ import type { MartinAdapter } from "../core/index.js";
|
|
|
15
15
|
import { type SpawnLike } from "./cli-bridge.js";
|
|
16
16
|
/**
|
|
17
17
|
* Given a prompt string, returns the full argv array to pass to spawn().
|
|
18
|
-
* Example for Claude: (
|
|
19
|
-
* Example for Codex: (
|
|
18
|
+
* Example for Claude: () => ["--output-format", "json", "--print"]
|
|
19
|
+
* Example for Codex: () => ["exec", "--sandbox", "workspace-write", "-"]
|
|
20
20
|
*/
|
|
21
21
|
export type CliArgsBuilder = (prompt: string) => string[];
|
|
22
|
+
export type CliStdinBuilder = (prompt: string) => string | undefined;
|
|
22
23
|
export interface AgentCliAdapterOptions {
|
|
23
24
|
/** The executable to spawn (e.g. "claude", "codex"). */
|
|
24
25
|
command: string;
|
|
25
26
|
/** Converts a prompt string into the argv array passed to spawn(). */
|
|
26
27
|
argsBuilder: CliArgsBuilder;
|
|
28
|
+
/** Optional stdin payload for CLIs that accept prompt input via stdin or `-`. */
|
|
29
|
+
stdinBuilder?: CliStdinBuilder;
|
|
27
30
|
/** Adapter ID suffix. Defaults to command. */
|
|
28
31
|
adapterIdSuffix?: string;
|
|
29
32
|
/** Working directory for all subprocesses. Defaults to process.cwd(). */
|
|
@@ -63,8 +66,16 @@ export interface CodexCliAdapterOptions {
|
|
|
63
66
|
label?: string;
|
|
64
67
|
/** Override the model passed via --model flag. */
|
|
65
68
|
model?: string;
|
|
66
|
-
/**
|
|
69
|
+
/**
|
|
70
|
+
* Deprecated no-op retained for compatibility.
|
|
71
|
+
*
|
|
72
|
+
* Codex CLI's supported non-interactive entrypoint is `codex exec`.
|
|
73
|
+
* MartinLoop now uses explicit sandboxing instead of the legacy
|
|
74
|
+
* `--full-auto` compatibility path, which can exit before verifier execution.
|
|
75
|
+
*/
|
|
67
76
|
fullAuto?: boolean;
|
|
77
|
+
/** Codex sandbox mode for model-generated commands. Defaults to workspace-write. */
|
|
78
|
+
sandbox?: "read-only" | "workspace-write" | "danger-full-access";
|
|
68
79
|
/** Extra args appended after core args (before prompt). */
|
|
69
80
|
extraArgs?: string[];
|
|
70
81
|
spawnImpl?: SpawnLike;
|
|
@@ -81,7 +92,11 @@ export declare function createAgentCliAdapter(options: AgentCliAdapterOptions):
|
|
|
81
92
|
*/
|
|
82
93
|
export declare function createClaudeCliAdapter(options?: ClaudeCliAdapterOptions): MartinAdapter;
|
|
83
94
|
/**
|
|
84
|
-
* Spawns `codex
|
|
95
|
+
* Spawns `codex exec --cd <workspace> --sandbox <mode> [--model <model>] [extraArgs] -`.
|
|
96
|
+
*
|
|
97
|
+
* The prompt is delivered via stdin so Windows shell quoting cannot truncate or
|
|
98
|
+
* reinterpret long MartinLoop prompts that contain paths, deny rules, or budget
|
|
99
|
+
* context.
|
|
85
100
|
*
|
|
86
101
|
* Requires the Codex CLI to be installed and authenticated:
|
|
87
102
|
* npm install -g @openai/codex
|
|
@@ -129,15 +129,12 @@ export function createAgentCliAdapter(options) {
|
|
|
129
129
|
}
|
|
130
130
|
}
|
|
131
131
|
const args = options.argsBuilder(prompt);
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
const useStdin = args.at(-1) === "--stdin-prompt";
|
|
135
|
-
const spawnArgs = useStdin ? args.slice(0, -1) : args;
|
|
136
|
-
const agentResult = await runSubprocess(options.command, spawnArgs, {
|
|
132
|
+
const stdinData = options.stdinBuilder?.(prompt);
|
|
133
|
+
const agentResult = await runSubprocess(options.command, args, {
|
|
137
134
|
cwd: workingDirectory,
|
|
138
135
|
timeoutMs,
|
|
139
136
|
spawnImpl: options.spawnImpl,
|
|
140
|
-
...(
|
|
137
|
+
...(stdinData === undefined ? {} : { stdinData })
|
|
141
138
|
});
|
|
142
139
|
if (agentResult.timedOut) {
|
|
143
140
|
return {
|
|
@@ -157,18 +154,19 @@ export function createAgentCliAdapter(options) {
|
|
|
157
154
|
};
|
|
158
155
|
}
|
|
159
156
|
if (agentResult.exitCode !== 0 && agentResult.stdout.trim().length === 0) {
|
|
157
|
+
const failureMessage = formatPreVerifierSubprocessFailure(options.command, agentResult.stderr, agentResult.exitCode);
|
|
160
158
|
return {
|
|
161
159
|
status: "failed",
|
|
162
|
-
summary: `${options.command} subprocess exited
|
|
160
|
+
summary: `${options.command} subprocess exited before verifier execution.`,
|
|
163
161
|
usage: normalizeUsage({
|
|
164
162
|
actualUsd: 0,
|
|
165
163
|
tokensIn: 0,
|
|
166
164
|
tokensOut: 0,
|
|
167
165
|
provenance: "unavailable"
|
|
168
166
|
}),
|
|
169
|
-
verification: { passed: false, summary:
|
|
167
|
+
verification: { passed: false, summary: `Verifier not run: ${failureMessage}` },
|
|
170
168
|
failure: {
|
|
171
|
-
message:
|
|
169
|
+
message: failureMessage
|
|
172
170
|
}
|
|
173
171
|
};
|
|
174
172
|
}
|
|
@@ -355,40 +353,52 @@ export function createClaudeCliAdapter(options = {}) {
|
|
|
355
353
|
"--print",
|
|
356
354
|
"--dangerously-skip-permissions",
|
|
357
355
|
...modelArgs,
|
|
358
|
-
...extraArgs
|
|
359
|
-
|
|
360
|
-
|
|
356
|
+
...extraArgs
|
|
357
|
+
],
|
|
358
|
+
stdinBuilder: (prompt) => prompt
|
|
361
359
|
});
|
|
362
360
|
}
|
|
363
361
|
// ---------------------------------------------------------------------------
|
|
364
362
|
// Pre-configured: OpenAI Codex CLI
|
|
365
363
|
// ---------------------------------------------------------------------------
|
|
366
364
|
/**
|
|
367
|
-
* Spawns `codex
|
|
365
|
+
* Spawns `codex exec --cd <workspace> --sandbox <mode> [--model <model>] [extraArgs] -`.
|
|
366
|
+
*
|
|
367
|
+
* The prompt is delivered via stdin so Windows shell quoting cannot truncate or
|
|
368
|
+
* reinterpret long MartinLoop prompts that contain paths, deny rules, or budget
|
|
369
|
+
* context.
|
|
368
370
|
*
|
|
369
371
|
* Requires the Codex CLI to be installed and authenticated:
|
|
370
372
|
* npm install -g @openai/codex
|
|
371
373
|
*/
|
|
372
374
|
export function createCodexCliAdapter(options = {}) {
|
|
373
|
-
const fullAuto = options.fullAuto !== false;
|
|
374
375
|
const modelArgs = options.model ? ["--model", options.model] : [];
|
|
375
376
|
const extraArgs = options.extraArgs ?? [];
|
|
377
|
+
const sandbox = options.sandbox ?? "workspace-write";
|
|
378
|
+
const workingDirectory = options.workingDirectory ?? process.cwd();
|
|
376
379
|
return createAgentCliAdapter({
|
|
377
380
|
command: "codex",
|
|
378
381
|
adapterIdSuffix: "codex",
|
|
379
382
|
model: options.model ?? "codex",
|
|
380
383
|
label: options.label ?? "Codex CLI adapter",
|
|
381
|
-
workingDirectory
|
|
384
|
+
workingDirectory,
|
|
382
385
|
timeoutMs: options.timeoutMs,
|
|
383
386
|
verifyTimeoutMs: options.verifyTimeoutMs,
|
|
384
387
|
supportsJsonOutput: false,
|
|
385
388
|
spawnImpl: options.spawnImpl,
|
|
386
|
-
argsBuilder: (
|
|
387
|
-
|
|
389
|
+
argsBuilder: () => [
|
|
390
|
+
"exec",
|
|
391
|
+
"--cd",
|
|
392
|
+
workingDirectory,
|
|
393
|
+
"--sandbox",
|
|
394
|
+
sandbox,
|
|
395
|
+
"--color",
|
|
396
|
+
"never",
|
|
388
397
|
...modelArgs,
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
]
|
|
398
|
+
...extraArgs,
|
|
399
|
+
"-"
|
|
400
|
+
],
|
|
401
|
+
stdinBuilder: (prompt) => prompt
|
|
392
402
|
});
|
|
393
403
|
}
|
|
394
404
|
// ---------------------------------------------------------------------------
|
|
@@ -402,14 +412,23 @@ export function createCodexCliAdapter(options = {}) {
|
|
|
402
412
|
// ---------------------------------------------------------------------------
|
|
403
413
|
function buildPrompt(request) {
|
|
404
414
|
const lines = [];
|
|
415
|
+
const mutationMode = request.context.mutationMode ?? "edit";
|
|
405
416
|
lines.push("You are running in autonomous agentic mode.");
|
|
406
|
-
|
|
407
|
-
|
|
417
|
+
if (mutationMode === "verify_only") {
|
|
418
|
+
lines.push("DO NOT EDIT FILES. Run the verifier only and report whether it passes.");
|
|
419
|
+
lines.push("Do not ask for confirmation. Do not ask clarifying questions.");
|
|
420
|
+
}
|
|
421
|
+
else {
|
|
422
|
+
lines.push("MAKE ALL REQUIRED FILE EDITS NOW. Do not ask for confirmation. Do not ask clarifying questions.");
|
|
423
|
+
lines.push("Do not explain what you found without also making the changes. Edit the files and complete the task.");
|
|
424
|
+
}
|
|
408
425
|
lines.push("");
|
|
409
426
|
lines.push("If PROGRESS.md exists in your working directory, read it first for context from prior attempts.");
|
|
410
427
|
lines.push("If it does not exist, proceed with the objective below.");
|
|
411
428
|
lines.push("");
|
|
412
|
-
lines.push(
|
|
429
|
+
lines.push(mutationMode === "verify_only"
|
|
430
|
+
? "Complete the following verification-only task without making file changes."
|
|
431
|
+
: "Complete the following coding task. Make all necessary file changes.");
|
|
413
432
|
lines.push("When you are done, the verification commands listed below must pass.");
|
|
414
433
|
lines.push("");
|
|
415
434
|
lines.push("OBJECTIVE:");
|
|
@@ -447,7 +466,9 @@ function buildPrompt(request) {
|
|
|
447
466
|
lines.push(` Attempt ${String(attemptNumber)}`);
|
|
448
467
|
lines.push(` Remaining budget: $${String(request.context.remainingBudgetUsd)} USD`);
|
|
449
468
|
lines.push(` Remaining iterations: ${String(request.context.remainingIterations)}`);
|
|
450
|
-
lines.push(
|
|
469
|
+
lines.push(mutationMode === "verify_only"
|
|
470
|
+
? " Do not modify files; only run verification."
|
|
471
|
+
: " Do not expand scope beyond what is needed to pass verification.");
|
|
451
472
|
lines.push("");
|
|
452
473
|
if (request.previousAttempts.length > 0) {
|
|
453
474
|
lines.push("PRIOR FAILED ATTEMPTS (learn from these — do not repeat the same mistakes):");
|
|
@@ -494,6 +515,16 @@ function truncate(text, maxLength) {
|
|
|
494
515
|
}
|
|
495
516
|
return `...${text.slice(-(maxLength - 3))}`;
|
|
496
517
|
}
|
|
518
|
+
function formatPreVerifierSubprocessFailure(command, stderr, exitCode) {
|
|
519
|
+
const detail = stderr.trim() || `Exit code ${String(exitCode)}`;
|
|
520
|
+
const lowerDetail = detail.toLowerCase();
|
|
521
|
+
const codexLaunchBlocked = command === "codex" &&
|
|
522
|
+
/\b(full-auto|sandbox|approval|permission|trusted|safety|unexpected argument)\b/u.test(lowerDetail);
|
|
523
|
+
if (codexLaunchBlocked) {
|
|
524
|
+
return `Codex CLI failed before patch completion, likely due to its launch/sandbox configuration. MartinLoop invokes Codex through "codex exec --sandbox workspace-write"; verify Codex CLI auth and configuration if this persists. ${detail}. environment_mismatch`;
|
|
525
|
+
}
|
|
526
|
+
return `${detail}. environment_mismatch`;
|
|
527
|
+
}
|
|
497
528
|
const INJECTION_PATTERNS = [
|
|
498
529
|
/\[INST\]/gi,
|
|
499
530
|
/<\/?system>/gi,
|