@loops-adk/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +486 -0
- package/bin/loops.mjs +16 -0
- package/dist/App-3YQS6DXA.js +461 -0
- package/dist/App-3YQS6DXA.js.map +1 -0
- package/dist/agent-sdk-RF5VJZAT.js +95 -0
- package/dist/agent-sdk-RF5VJZAT.js.map +1 -0
- package/dist/anthropic-api-XJY6Y4T2.js +131 -0
- package/dist/anthropic-api-XJY6Y4T2.js.map +1 -0
- package/dist/api.d.ts +949 -0
- package/dist/api.js +898 -0
- package/dist/api.js.map +1 -0
- package/dist/chunk-33YIGWNU.js +63 -0
- package/dist/chunk-33YIGWNU.js.map +1 -0
- package/dist/chunk-3BPU34DE.js +2163 -0
- package/dist/chunk-3BPU34DE.js.map +1 -0
- package/dist/chunk-CXEPZHSR.js +86 -0
- package/dist/chunk-CXEPZHSR.js.map +1 -0
- package/dist/chunk-I3STY7U6.js +61 -0
- package/dist/chunk-I3STY7U6.js.map +1 -0
- package/dist/chunk-JFTXJ7I2.js +18 -0
- package/dist/chunk-JFTXJ7I2.js.map +1 -0
- package/dist/chunk-XC46B4FD.js +9 -0
- package/dist/chunk-XC46B4FD.js.map +1 -0
- package/dist/chunk-Y2SD7GBL.js +30 -0
- package/dist/chunk-Y2SD7GBL.js.map +1 -0
- package/dist/claude-cli-U7WEVAOL.js +124 -0
- package/dist/claude-cli-U7WEVAOL.js.map +1 -0
- package/dist/codex-6I5UZ2HM.js +60 -0
- package/dist/codex-6I5UZ2HM.js.map +1 -0
- package/dist/env/command.d.ts +53 -0
- package/dist/env/command.js +3 -0
- package/dist/env/command.js.map +1 -0
- package/dist/env/docker.d.ts +38 -0
- package/dist/env/docker.js +33 -0
- package/dist/env/docker.js.map +1 -0
- package/dist/env/sst.d.ts +39 -0
- package/dist/env/sst.js +20 -0
- package/dist/env/sst.js.map +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +620 -0
- package/dist/index.js.map +1 -0
- package/dist/types-B4wGVpqo.d.ts +898 -0
- package/package.json +100 -0
- package/skills/author-loop/SKILL.md +121 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Jonny Neill
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,486 @@
|
|
|
1
|
+
# loops
|
|
2
|
+
|
|
3
|
+
**Stop prompting agents. Write the loop that prompts them. Make "done" mean _converged_, not _claimed_.**
|
|
4
|
+
|
|
5
|
+
`loops` is a small, nestable library for running an agent in a convergence loop. The loop finds the work, hands it to an agent, checks the result, records what it learned, and goes again until a gate _you_ define says the work is finished. You write the loop once and it drives the agent, rather than prompting the agent by hand. Compose loops and DAGs both ways, run them against any model behind a one-method `Engine`, and watch a run in a live terminal UI.
|
|
6
|
+
|
|
7
|
+
Every iteration runs with a **fresh context**, so a long run never rots. Progress accumulates in **git, not the chat transcript**: the agent forgets between turns, the repository does not. The loop stops only when an **honest gate** clears: a deterministic check (the tests genuinely pass) alongside a separate judge in its own context, so the model that did the work is never the one that grades it. The gate is the core idea. It keeps a loop from declaring itself finished on a half-built job and spending tokens with nothing to show.
|
|
8
|
+
|
|
9
|
+
Where most "agent memory" recalls a _conversation_, this keeps your _decisions_ consistent across long work. No vector database, no embeddings, no index to sync or let go stale. **Git is the memory.**
|
|
10
|
+
|
|
11
|
+

|
|
12
|
+

|
|
13
|
+

|
|
14
|
+

|
|
15
|
+
|
|
16
|
+
```ts
|
|
17
|
+
import { loop, agentJob, commandSucceeds, agentCheck } from '@loops-adk/core';
|
|
18
|
+
|
|
19
|
+
// Keep working until the tests pass AND a judge agrees it matches intent.
|
|
20
|
+
export default loop({
|
|
21
|
+
name: 'build-feature',
|
|
22
|
+
max: 20,
|
|
23
|
+
body: agentJob({
|
|
24
|
+
prompt: (c) => `Iteration ${c.iteration}: make concrete progress on TASK.md.`,
|
|
25
|
+
ground: true, // read the commit log + this run's scratch files before working
|
|
26
|
+
}),
|
|
27
|
+
until: [
|
|
28
|
+
commandSucceeds('npm', ['test']), // ground truth
|
|
29
|
+
agentCheck({ question: 'Does it match TASK.md?', threshold: 0.85 }), // intent
|
|
30
|
+
],
|
|
31
|
+
commit: { subject: 'feat: TASK.md' }, // one milestone commit when it converges
|
|
32
|
+
});
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## Install
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
npm i @loops-adk/core # Node >= 20
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Write a loop in a `.loop.ts` file, then run it. `loops run` works from any repo that has the package installed:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
loops validate your-feature.loop.ts # offline pre-flight: prints the loop's shape, no model calls
|
|
47
|
+
loops run your-feature.loop.ts # run it (live TUI; add --no-tui or --json for headless)
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
The full CLI, the flags-only mode (no file), and the offline demo are in [Quick start](#quick-start) below.
|
|
51
|
+
|
|
52
|
+
## A whole engineering team, defined as files
|
|
53
|
+
|
|
54
|
+
The primitives compose into something bigger than a single loop: an **engineering team** that builds a multi-component service, holds it coherent across components, and converges only when each piece clears a bar one agent can't impose on itself: a report-only **review battery** of distinct lenses, including a genuinely different model.
|
|
55
|
+
|
|
56
|
+
```ts
|
|
57
|
+
// Five report-only lenses, each a markdown persona that closes with `<confidence>N%</confidence>`.
|
|
58
|
+
// The adversarial lens runs on a DIFFERENT model (codex / GPT-5): any reviewer, any model.
|
|
59
|
+
const battery = (name) =>
|
|
60
|
+
reviewPanel(name, [
|
|
61
|
+
['adversarial', { engine: 'codex' }], // genuinely different priors
|
|
62
|
+
['security', { model: 'opus' }],
|
|
63
|
+
['correctness', { model: 'sonnet' }],
|
|
64
|
+
['conformance', { model: 'opus' }],
|
|
65
|
+
['simplicity', { model: 'haiku' }],
|
|
66
|
+
]);
|
|
67
|
+
|
|
68
|
+
const engineer = (name) =>
|
|
69
|
+
loop({
|
|
70
|
+
name,
|
|
71
|
+
body: agentJob({ agent: engineerFor(name), prompt: brief(name), ground: true }),
|
|
72
|
+
until: commandSucceeds('node', [`test-${name}.mjs`]), // deterministic truth
|
|
73
|
+
review: battery(name), // unanimous; a failing review hands its findings to the next iteration
|
|
74
|
+
commit: true,
|
|
75
|
+
max: 8,
|
|
76
|
+
});
|
|
77
|
+
|
|
78
|
+
export default dag({
|
|
79
|
+
name: 'build-service',
|
|
80
|
+
nodes: {
|
|
81
|
+
store: engineer('store'),
|
|
82
|
+
api: { needs: ['store'], job: engineer('api') },
|
|
83
|
+
serialize: { needs: ['store'], isolate: true, job: engineer('serialize') }, // parallel worktree
|
|
84
|
+
client: { needs: ['api', 'serialize'], job: engineer('client') },
|
|
85
|
+
},
|
|
86
|
+
isolation: 'worktree',
|
|
87
|
+
});
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
The `dag` is the manager (toposort + dispatch). Each node is a Converge loop: the engineer builds to its `test` (`until`), then the **review battery** runs in the `review` slot: five report-only lenses with near-disjoint blind spots, each judging the actual source against the recorded contracts and closing with a `<confidence>N%</confidence>`. Because a reviewer is just an `AgentDef` and `agentCheck` takes an `engine` and `model`, **any reviewer runs on any model**: the adversarial lens on codex (GPT-5) for a true second-model signal, the rest spread across Claude. A failing review is not a dead end: its findings thread into the next iteration as `lastReview`, so the engineer fixes concrete concerns: the build → review → fix-up loop, with no human in it. `isolate` runs engineers in parallel worktrees that land back on pass; `ground: true` carries the contracts only `store` decides (stable ids, the `SSv1|` wire tag) to the engineers and reviewers downstream.
|
|
91
|
+
|
|
92
|
+
A single autonomous agent grades its own homework. This team **structurally cannot**: "done" means past an independent, multi-lens, multi-model review battery it never applies to itself. That enforced honest-convergence gate is the deepest idea here; memory is one free pillar underneath it. The whole team (engineers and reviewers) is a folder of markdown personas plus the wiring above, runnable in [`examples/build-service.loop.ts`](examples/build-service.loop.ts).
|
|
93
|
+
|
|
94
|
+
## Why loops?
|
|
95
|
+
|
|
96
|
+
Agents rarely nail it in one shot. The reliable pattern is a **convergence loop**: do a bit of work, check whether you're _actually_ done, and if not, go again. Three things make or break it, and `loops` is built around all three:
|
|
97
|
+
|
|
98
|
+
- **A fresh context every turn.** Long-running agents rot as their history balloons. `loops` runs each iteration with a clean slate and lets progress accumulate where it belongs: in the **workspace** (files, git commits), not in a chat transcript. The loop carries only thin bookkeeping.
|
|
99
|
+
- **Memory in git, not in the transcript.** Fresh context alone would mean amnesia. **Ledger** (below) writes the _why_ to git as the work happens and reads it back before the next turn, so a clean slate is never a blank one.
|
|
100
|
+
- **A real done-check.** "Ask the model if it's finished" is the classic trap: the model grades its own homework. `loops` makes the gate a first-class value and lets you combine a **deterministic** signal (the tests genuinely pass) with a **separate judge**, so "done" means _converged_, not _claims to be_.
|
|
101
|
+
|
|
102
|
+
Everything else (DAGs, nesting, engines, budgets, the TUI) hangs off those ideas. The whole thing is small enough to read in an afternoon.
|
|
103
|
+
|
|
104
|
+
## What loops does differently
|
|
105
|
+
|
|
106
|
+
A loop is easy to start and hard to keep honest. Four parts decide whether it earns its cost, and `loops` is built around them.
|
|
107
|
+
|
|
108
|
+
| The hard part | In `loops` |
|
|
109
|
+
| --- | --- |
|
|
110
|
+
| **The gate.** Knowing the work is actually done, not just that the agent stopped. | A deterministic check (`commandSucceeds`) and a separate judge (`agentCheck`) in its own context, hardened with a k-of-n `quorum` and a geometric-mean rubric so one weak dimension sinks the verdict. The model that did the work never grades it. |
|
|
111
|
+
| **Memory.** Carrying what was learned across a run without dragging a transcript along. | The git commit log is the memory: a structured handoff per milestone, read back before the next turn. No `STATE.md` the model is trusted to keep tidy, no vector store to sync. |
|
|
112
|
+
| **Parallelism.** Running several agents without collisions on the same files. | `isolation: 'worktree'` gives each writer its own branch and worktree, landed back on pass with a `--no-ff` merge. |
|
|
113
|
+
| **Hard stops.** Bounding a loop so it cannot run forever or empty your account. | `max` caps iterations and `budget` caps tokens, a non-retryable stop the engine calls refuse to cross. |
|
|
114
|
+
|
|
115
|
+
Three things `loops` does that most loop libraries do not:
|
|
116
|
+
|
|
117
|
+
- **Nesting is a primitive.** `loop()` and `dag()` both return a `Job`, so loops nest inside DAGs and DAGs nest inside loops, to any depth. Orchestrating many loops is one expression, not a separate harness.
|
|
118
|
+
- **Memory survives a squash merge.** A squash merge flattens a branch's commit bodies into a list of subject lines and loses the reasoning. `pullRequestJob` and `mergeJob` keep the squashed commit body a consolidation of the branch.
|
|
119
|
+
- **It runs against any model or tool.** The agent launch only touches a one-method `Engine`, so the same loop runs on Claude, on a different model, or on your own provider, unchanged.
|
|
120
|
+
|
|
121
|
+
Two parts are deliberately out of scope. The heartbeat that fires a loop on a schedule belongs in cron, GitHub Actions, or a workflow engine, with a `loops` job inside. Acting in external tools is the agent's own job through its tools. `loops` is the body of the loop, kept small.
|
|
122
|
+
|
|
123
|
+
## From source
|
|
124
|
+
|
|
125
|
+
> **Status: alpha**, the API is still settling. To work on `loops` or run it from a checkout:
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
git clone https://github.com/jonny981/loops.git
|
|
129
|
+
cd loops
|
|
130
|
+
npm install
|
|
131
|
+
node bin/loops.mjs --help # or: npm link → loops --help
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Requires **Node ≥ 20**. Running from a checkout needs no build step: the CLI runs the TypeScript source directly through [`tsx`](https://github.com/privatenumber/tsx).
|
|
135
|
+
|
|
136
|
+
## Quick start
|
|
137
|
+
|
|
138
|
+
**Flags mode**, the standard `worker → until → review` loop, no code:
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
loops run \
|
|
142
|
+
--prompt "Continue implementing the feature in TASK.md; report what changed." \
|
|
143
|
+
--engine claude-cli \
|
|
144
|
+
--until "Is the feature fully implemented with passing tests?" --threshold 0.85 \
|
|
145
|
+
--review "Does it pass a strict review with no blockers?" \
|
|
146
|
+
--max 20
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
**Definition-file mode**: full power and nesting. A `.loop.ts` file `export default`s a `Job`:
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
loops validate examples/confidence-gate.loop.ts # offline pre-flight: load + print the shape, no model calls
|
|
153
|
+
loops describe examples/confidence-gate.loop.ts # print the loop's shape (gate, body, nodes) without running
|
|
154
|
+
loops run examples/confidence-gate.loop.ts # live Ink TUI
|
|
155
|
+
loops run examples/confidence-gate.loop.ts --no-tui # plain streamed logs
|
|
156
|
+
loops run examples/confidence-gate.loop.ts --json # NDJSON event stream
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
> `loops run <file>` **imports and executes** that file's module, like `node <file>`. Only run definition files you trust.
|
|
160
|
+
|
|
161
|
+
**Authoring is agent-native.** Both commands work from any repo, including one that consumes `loops` as a submodule or dependency (the recipe's folder just needs an ES module scope, which such repos already have). `loops validate <file>` is the cheap, no-model pre-flight an agent runs before `loops run`: it loads the loop, reports a fix-oriented error if anything is wrong, and prints the loop's shape (its gate, body, and dag nodes), all without spending a single agent turn. `loops describe <file>` prints that same shape on its own, so an agent can see exactly what it just authored. The authoring guide an agent reads to compose a loop is [`skills/author-loop/SKILL.md`](skills/author-loop/SKILL.md).
|
|
162
|
+
|
|
163
|
+
**Offline demo** (no network, no key; uses the mock engine):
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
npm run example:poll
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
## Core idea: everything is a `Job`
|
|
170
|
+
|
|
171
|
+
There is one universal unit of work, and two supporting types:
|
|
172
|
+
|
|
173
|
+
```ts
|
|
174
|
+
type Job = (ctx: JobContext) => Promise<Outcome>; // a unit of work, any size
|
|
175
|
+
type Condition = (ctx, last) => Promise<{ met; reason; confidence? }>; // a yes/no gate
|
|
176
|
+
interface Engine {
|
|
177
|
+
run(req, onEvent, signal): Promise<AgentResult>;
|
|
178
|
+
} // where an agent turn runs
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
- **`loop()` returns a `Job`**, so a loop nests by passing one as another's `body` or `review`.
|
|
182
|
+
- **`dag()` returns a `Job` too**, so loops and DAGs nest **both ways**: a DAG node can be a loop, a loop body can be a DAG.
|
|
183
|
+
|
|
184
|
+
Nesting is the absence of a special case, not a feature.
|
|
185
|
+
|
|
186
|
+
### `loop(config)`
|
|
187
|
+
|
|
188
|
+
```ts
|
|
189
|
+
loop({
|
|
190
|
+
name: 'build-feature',
|
|
191
|
+
body, // the Job run each iteration (fresh context); pass a loop()/dag() to nest
|
|
192
|
+
start, // gate before iterating; unmet ⇒ aborted
|
|
193
|
+
until, // checked after each body; met ⇒ stop (then review)
|
|
194
|
+
stopOn, // hard early-exit each iteration; met ⇒ aborted
|
|
195
|
+
review, // runs when until is met; non-pass re-enters the loop (folds back as ctx.lastReview)
|
|
196
|
+
max, // iteration cap; reached without passing ⇒ exhausted
|
|
197
|
+
maxReviewRestarts, // cap the worker/reviewer standoff independently of max
|
|
198
|
+
delayMs, // delay between iterations (polling); interruptible by abort
|
|
199
|
+
retry, // { onError: 'continue' | 'fail', maxConsecutive?, backoffMs? }
|
|
200
|
+
onIteration,
|
|
201
|
+
onComplete, // hooks (onComplete runs once, whatever the outcome)
|
|
202
|
+
});
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
With no `until`, a `pass` body ends the loop. Terminal status is one of `pass · fail · exhausted · aborted · paused` (CLI exit codes `0 · 1 · 2 · 130 · 75`). `paused` is a limit-driven, resumable stop. See [Rate limits, quotas, and budgets](#rate-limits-quotas-and-budgets-wait-or-resume).
|
|
206
|
+
|
|
207
|
+
## Conditions: honest convergence
|
|
208
|
+
|
|
209
|
+
`start` / `until` / `stopOn` accept **one item or many**, freely mixing deterministic predicates and agent judges. Arrays are `all` by default (wrap in `any(...)` for or):
|
|
210
|
+
|
|
211
|
+
```ts
|
|
212
|
+
until: [
|
|
213
|
+
commandSucceeds('npm', ['test']), // deterministic ground truth
|
|
214
|
+
agentCheck({ question: 'Good enough to ship?', threshold: 0.9 }), // agent-validated intent
|
|
215
|
+
];
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
Prefer this mixed form over a lone judge. A model's self-reported confidence is a weak, poorly-calibrated signal. Treat it as a guard on _intent_, with a deterministic check as the _truth_. Two ways to harden the judge itself:
|
|
219
|
+
|
|
220
|
+
```ts
|
|
221
|
+
// k-of-n jury: consensus, not one number
|
|
222
|
+
quorum(2, judgeA, judgeB, judgeC);
|
|
223
|
+
|
|
224
|
+
// one judge, multiple dimensions: opens on the GEOMETRIC MEAN,
|
|
225
|
+
// so a single weak dimension drags the verdict down
|
|
226
|
+
agentCheck({
|
|
227
|
+
question: 'Ready to ship?',
|
|
228
|
+
threshold: 0.8,
|
|
229
|
+
dimensions: ['intent match', 'evidence quality', 'outcome coherence'],
|
|
230
|
+
});
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
**Builders:** `predicate`, `bodyPassed`, `minConfidence`, `commandSucceeds` (a shell command exits 0), `all`, `any`, `not`, `quorum` (k-of-n), `agentCheck` (small-model judge), `always`, `never`, and `gateJob` (lift a condition into a `Job`, e.g. a reviewer).
|
|
234
|
+
|
|
235
|
+
## Ledger: memory built on git
|
|
236
|
+
|
|
237
|
+
Fresh context kills _rot_; on its own it would cause _amnesia_. **Ledger** is the core that closes the gap: the loop writes its reasoning to git as it works and reads it back before the next turn. No parallel database, no vector store; git _is_ the index: nothing to build, embed, sync, or let go stale (the commit log can't drift out of sync with the code; it _is_ the code's history). (`Ledger` is the engine; the **commit log** is the durable memory it reads and writes; `.loops/ledger.md` and `.loops/prompt.md` are the live scratch files for work in flight.)
|
|
238
|
+
|
|
239
|
+
The three tiers below form a progression. The scratch files record what failed and what was tried. The gate turns a fix into a verified fact. The milestone commit distills it into a durable decision. Grounding lets the next turn read that decision instead of re-deriving it.
|
|
240
|
+
|
|
241
|
+
- **Scratch files: working memory and a handoff.** Two gitignored files carry a unit of work forward. `.loops/ledger.md` is **working memory** for the agent(s) doing the work now: the harness auto-captures each grounded turn (the reasoning + a summary of actions), so the why is recorded even when no single agent holds it all at the end, and fanned-out peers share it. `.loops/prompt.md` is the **handoff** the agent distils for whoever continues: intent, alternatives ruled out, constraints, what is left. Grounding injects both into the next context; the commit body is the handoff plus a compacted working log.
|
|
242
|
+
|
|
243
|
+
```ts
|
|
244
|
+
appendPrompt(ctx.workspace, { heading: 'Why', body: 'tried a token refresh; the gate still failed on scope' });
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
- **Milestone commits: crystallise it.** A commit is a _milestone_, not an iteration. When a loop converges, `commitJob` composes one structured body, the handoff plus a compacted working log (the **why**), welded to the diff (the **what**), then clears both scratch files. Turn it on with `commit:`; iterations stay durable in the workspace + scratch files, so the log holds only converged, reasoned-over checkpoints. Welded to its diff, a commit body is a permanent record any later agent can look back to, as far back as it wants. Finer milestones? Compose finer loops/nodes.
|
|
248
|
+
|
|
249
|
+
```ts
|
|
250
|
+
loop({ name: 'build', body, until, commit: { subject: 'feat: the feature' } });
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
- **Grounding: read it back.** A fresh turn reads the recent commit log (past milestones) and this run's live scratch files (working memory + handoff), prepended to its prompt, so it knows what was already tried. The reach is **branch-local**: adjacent branches are in-flight and may never land, and the merge is where work becomes shared truth.
|
|
254
|
+
|
|
255
|
+
```ts
|
|
256
|
+
agentJob({ label: 'work', prompt: 'Continue the task.', ground: true });
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
- **Scaling the read: retrieval, then consolidation.** Recent-N grounding is the default, but on a long, noisy log the relevant commit falls out of the window. `ground: { retrieve: true }` has a cheap model select the relevant commits by subject instead. Use it for long-horizon work. For an indefinite process, `consolidateJob` folds the history into a **decision-preserving consolidated ledger**: a bounded record that keeps every accrued decision verbatim (a naive progress summary loses the specifics), committed as a commit body (the coarse tier, grounded like any milestone, never a side file). Retrieval finds the _relevant_ past commits; consolidation keeps _all the decisions_ in bounded space: different jobs, both in the git grain.
|
|
260
|
+
|
|
261
|
+
```ts
|
|
262
|
+
agentJob({ label: 'work', prompt: 'Continue.', ground: { retrieve: true } });
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
- **Ship via PR: survive the squash.** The commit log is the memory, but a **squash merge** collapses a branch's milestone bodies into one commit whose body defaults to a list of subject lines, the reasoning lost from the base branch. `pullRequestJob` closes that: it pushes the branch and opens (or idempotently updates) a PR whose body is the same `consolidate` fold scoped to this branch, kept current as milestones land. `mergeJob` then squash-merges with that synthesis as the commit body, gated on CI (`auto: true` hands the wait to GitHub; `when: forgeChecks()` is a synchronous gate). The host is the injectable `Forge` seam (the `gh` CLI by default), so it runs offline against a `MockForge`.
|
|
266
|
+
|
|
267
|
+
```ts
|
|
268
|
+
sequence('ship', pullRequestJob({ base: 'main' }), mergeJob({ base: 'main', auto: true }));
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
The Ledger has **two faces**: _cross-iteration_ (recover from your own failed attempts in a retry loop) and _cross-node_ (honour an upstream node's decision a downstream agent could not otherwise know). Both need headroom. On one-shot, single-node work, memory is only a tax. See [docs/concepts.md](docs/concepts.md) for where it helps and the measured evidence in [bench/RESULTS.md](bench/RESULTS.md).
|
|
272
|
+
|
|
273
|
+
## Engines: bring any model
|
|
274
|
+
|
|
275
|
+
The agent launch only ever touches the `Engine` interface, so the loop knows nothing about your model, provider, or framework.
|
|
276
|
+
|
|
277
|
+
| name | backend | notes |
|
|
278
|
+
| --------------- | -------------------------------- | ----------------------------------------------------------- |
|
|
279
|
+
| `claude-cli` | `claude` subprocess (`execa`) | fresh process per call; uses host Claude auth, no key |
|
|
280
|
+
| `agent-sdk` | `@anthropic-ai/claude-agent-sdk` | fresh `query()` per call; host Claude auth |
|
|
281
|
+
| `anthropic-api` | `@anthropic-ai/sdk` | token-level streaming; cheapest for judges; needs a key |
|
|
282
|
+
| `codex` | `codex exec` subprocess (GPT-5) | a genuinely different model for a second-model reviewer; read-only |
|
|
283
|
+
| `mock` | scripted, offline | for tests and examples |
|
|
284
|
+
|
|
285
|
+
Select per-run (`--engine`, `RunOptions.engine`) or per-job/condition (`engine:` takes a name **or** a ready-made `Engine`). Bring your own in ~10 lines:
|
|
286
|
+
|
|
287
|
+
```ts
|
|
288
|
+
import { run, type Engine } from '@loops-adk/core';
|
|
289
|
+
|
|
290
|
+
const myEngine: Engine = {
|
|
291
|
+
name: 'my-provider',
|
|
292
|
+
async run(req, onEvent, signal) {
|
|
293
|
+
// call any provider/framework; stream tokens via onEvent({ type: 'text', delta })
|
|
294
|
+
return { text, usage: { inputTokens, outputTokens }, model: req.model ?? 'x' };
|
|
295
|
+
},
|
|
296
|
+
};
|
|
297
|
+
|
|
298
|
+
await run(job, { engine: 'my-provider', engines: { 'my-provider': myEngine } });
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
That's the whole contract: implement `run`, register a name. A managed/durable runner could be a drop-in engine too.
|
|
302
|
+
|
|
303
|
+
## Agents: define a specialist once
|
|
304
|
+
|
|
305
|
+
Instead of a wall of inline prompt, define each agent as a reusable, job-specific **`AgentDef`**: the persona and methodologies live in editable **markdown files**, the structure and types live in TypeScript. The `.ts` is the strongly-typed wrapper around the `.md`:
|
|
306
|
+
|
|
307
|
+
```ts
|
|
308
|
+
import { defineAgent, defineSkill, fromFile, agentJob } from '@loops-adk/core';
|
|
309
|
+
|
|
310
|
+
const tdd = defineSkill({ name: 'tdd', instructions: fromFile(new URL('./skills/tdd.md', import.meta.url)) });
|
|
311
|
+
|
|
312
|
+
const storeEngineer = defineAgent({
|
|
313
|
+
name: 'store-engineer',
|
|
314
|
+
system: fromFile(new URL('./agents/store-engineer.md', import.meta.url)), // the persona, as markdown
|
|
315
|
+
model: 'sonnet',
|
|
316
|
+
tools: ['edit', 'bash'],
|
|
317
|
+
capabilities: ['storage engine', 'id stability'],
|
|
318
|
+
skills: [tdd], // methodologies fold into the system
|
|
319
|
+
failureModes: [{ mode: 'tests-flaky', recovery: 'isolate the flake, retry once' }],
|
|
320
|
+
});
|
|
321
|
+
|
|
322
|
+
agentJob({ agent: storeEngineer, prompt: 'Build the store to its tests.', ground: true });
|
|
323
|
+
```
|
|
324
|
+
|
|
325
|
+
`agentJob` resolves the def into the engine request (`system` = persona + skills, plus `model`/`tools`); inline `system`/`model`/`tools` still override it. A **skill** is a methodology (how to work: TDD, writing-plans), not a worker. This is what turns a `dag` into a named **team** (`storeEngineer`, `apiEngineer`, `securityReviewer` as small files) orchestrated by the DAG and gated by `quorum(...)`.
|
|
326
|
+
|
|
327
|
+
## Environments: test the running thing
|
|
328
|
+
|
|
329
|
+
A gate is only as honest as what it tests. `commandSucceeds('npm', ['test'])` checks files on disk; to check that the thing _works_ you need it running. The **Environment** axis is where code runs (local services or a per-branch cloud preview), so `until` can gate on the live preview, not just static files. It is the third provider axis:
|
|
330
|
+
|
|
331
|
+
| Axis | Where it… | Lives in |
|
|
332
|
+
| ------------- | --------------- | --------------------- |
|
|
333
|
+
| `Engine` | the agent thinks | model / provider |
|
|
334
|
+
| `Workspace` | the code lives | worktree + branch |
|
|
335
|
+
| `Environment` | the code runs | local / cloud preview |
|
|
336
|
+
|
|
337
|
+
Like `Engine`, loops owns only the interface and the lifecycle binding; the adapter (sst, Vercel, Docker…) is yours and lives next to the deploy config it wraps; loops never depends on a deploy tool. The handle's `env` (e.g. `BASE_URL`) is injected into gate commands, so the done-check reaches the live preview.
|
|
338
|
+
|
|
339
|
+
```ts
|
|
340
|
+
import { run, loop, commandSucceeds, type Environment } from '@loops-adk/core';
|
|
341
|
+
|
|
342
|
+
const sstEnv: Environment = {
|
|
343
|
+
name: 'sst',
|
|
344
|
+
async up(ws) {
|
|
345
|
+
const url = await deployStage(slug(ws.branch), ws.dir); // your deploy
|
|
346
|
+
return { url, env: { BASE_URL: url }, down: () => removeStage(slug(ws.branch)) };
|
|
347
|
+
},
|
|
348
|
+
};
|
|
349
|
+
|
|
350
|
+
const job = loop({ name: 'build', body, until: commandSucceeds('playwright', ['test']) });
|
|
351
|
+
await run(job, { environment: sstEnv }); // one env for the run…
|
|
352
|
+
// …or DagConfig.environment to give every worktree-team its own stage, named after its branch.
|
|
353
|
+
```
|
|
354
|
+
|
|
355
|
+
Environments are **optional**: a research pipeline that never deploys just leaves it unset, and the gates test files and commands without a `BASE_URL`.
|
|
356
|
+
|
|
357
|
+
**Built-in adapters** (opt-in subpaths, no added dependency; they shell out to the CLI on PATH):
|
|
358
|
+
|
|
359
|
+
- `@loops-adk/core/env/command`: `commandEnvironment`, the generic factory every IaC tool fits (deploy / read outputs / destroy). sst, terraform, pulumi, and cloudformation-via-aws-cli are all thin presets over it.
|
|
360
|
+
- `@loops-adk/core/env/sst`: `sstEnvironment`, a per-branch sst stage (`sst deploy --stage <branch>`).
|
|
361
|
+
- `@loops-adk/core/env/docker`: `dockerEnvironment`, a local stack via a per-branch Docker Compose project, with ephemeral-port discovery so parallel branches never collide.
|
|
362
|
+
|
|
363
|
+
SDK-bound adapters (e.g. the AWS SDK) add a real dependency, so they belong in your own package or loop definition, not the core.
|
|
364
|
+
|
|
365
|
+
## Composition: loops and DAGs
|
|
366
|
+
|
|
367
|
+
```ts
|
|
368
|
+
import { dag, sequence, parallel, loop, agentJob, gateJob, agentCheck } from '@loops-adk/core';
|
|
369
|
+
|
|
370
|
+
dag({
|
|
371
|
+
name: 'ship',
|
|
372
|
+
concurrency: 2,
|
|
373
|
+
nodes: {
|
|
374
|
+
research: agentJob({ label: 'research', prompt: '…' }),
|
|
375
|
+
implement: { needs: ['research'], job: loop({ /* … a loop as a node */ }) },
|
|
376
|
+
test: { needs: ['implement'], job: agentJob({ label: 'test', prompt: '…' }) },
|
|
377
|
+
review: { needs: ['test'], job: gateJob('review', agentCheck({ /* … */ })) },
|
|
378
|
+
},
|
|
379
|
+
});
|
|
380
|
+
```
|
|
381
|
+
|
|
382
|
+
`needs` = dependencies; a non-`pass` required dependency blocks its dependents; `optional` nodes never block or fail the DAG; an unmet `when` skips a node (counts green); cycles are detected before any work runs. `sequence(name, ...jobs)` and `parallel(name, jobs, concurrency?)` are sugar over `dag`.
|
|
383
|
+
|
|
384
|
+
**Worktree isolation: branches as teams.** A concurrent node can run in its own git worktree on a fork branch (`isolation: 'worktree'` on the DAG, or `isolate: true` per node), so parallel writers never collide on files or the index. On pass, its committed work lands back into the line with a `--no-ff` merge; a conflict fails the node honestly (loops does not auto-resolve; that's a separate layer). Each team gets its own branch, its own scratch files, and (with `DagConfig.environment`) its own stage, all born and torn down together.
|
|
385
|
+
|
|
386
|
+
For **dynamic** dispatch (a loop that discovers each unit at runtime and routes it to its own isolated sub-loop), `isolated(job)` is the same boundary as a composable wrapper rather than a predeclared node (fork, run, land back on pass):
|
|
387
|
+
|
|
388
|
+
```ts
|
|
389
|
+
loop({ name: 'triage', until: queueEmpty, body: pickAndDispatch });
|
|
390
|
+
// where pickAndDispatch routes each ticket to isolated(convergeLoop) or isolated(sweep)
|
|
391
|
+
```
|
|
392
|
+
|
|
393
|
+
## Loop archetypes: Converge, Sweep, Tend
|
|
394
|
+
|
|
395
|
+
A loop is not one shape. Three recur, and they differ in what memory does and in what you can even measure: a harness built for one is blind to the others.
|
|
396
|
+
|
|
397
|
+
| | **Converge** | **Sweep** | **Tend** |
|
|
398
|
+
| --- | --- | --- | --- |
|
|
399
|
+
| shape | one hard target, retried | a known set, one fresh task each | an unbounded process picking the next unit |
|
|
400
|
+
| example | build to a high bar with tests | research each OEM | triage issues until none remain |
|
|
401
|
+
| iteration N vs N−1 | the **same** task | an **independent** task | a **discovered** task |
|
|
402
|
+
| terminates when | the gate passes | the worklist is empty | a dynamic condition (maybe never) |
|
|
403
|
+
| memory's job | don't re-walk dead ends | transfer the house style | remember what's done + decided, forever |
|
|
404
|
+
| `loops` shape | `loop({ until: gate, max })` | `loop`/`dag` over a worklist | `loop({ until: dynamic, max: ∞ })` |
|
|
405
|
+
|
|
406
|
+
They **nest**: GitHub triage is Tend ∘ Converge (pick the next ticket, classify it, dispatch a Converge loop to a test gate); OEM research is Sweep ∘ Converge (each item is itself a multi-step build that must converge). Because a `loop` and a `dag` are both `Job`s, dispatch is just a body that selects a sub-`Job`. Wrap it in `isolated()` when each needs its own worktree. The Ledger's three tiers (scratch files → milestone commits → consolidated ledger) map onto the three nesting levels.
|
|
407
|
+
|
|
408
|
+
There is no `converge()` / `sweep()` / `tend()` in the API. They are patterns, not primitives. Copy-paste recipes for each (and the nested dispatch) are in [docs/patterns.md](docs/patterns.md); the full treatment is in [docs/concepts.md](docs/concepts.md).
|
|
409
|
+
|
|
410
|
+
## Budget, records, resume
|
|
411
|
+
|
|
412
|
+
Four opt-in `RunOptions` (with matching CLI flags). All default off.
|
|
413
|
+
|
|
414
|
+
| Option | CLI flag | Effect |
|
|
415
|
+
| ------------ | -------------------- | ------------------------------------------------------------------------------------- |
|
|
416
|
+
| `budget`     | `--budget <n>`       | Cap total tokens for the run. Engine calls are refused once the cap is hit.           |
|
|
417
|
+
| `recordTo` | `--record <path>` | Append every structured event as JSONL: a readable, queryable run record. |
|
|
418
|
+
| `checkpoint` | `--checkpoint <p>` | Snapshot the shared `ctx.state` at each loop/dag/job boundary (latest-wins). |
|
|
419
|
+
| `resumeFrom` | `--resume <path>` | Restore the `ctx.state` a prior `--checkpoint` wrote, so a re-run continues warm. |
|
|
420
|
+
|
|
421
|
+
```ts
|
|
422
|
+
await run(job, { budget: 2_000_000, recordTo: '.loops/run.jsonl', checkpoint: '.loops/state.json' });
|
|
423
|
+
// later, after a crash or a deliberate stop:
|
|
424
|
+
await run(job, { resumeFrom: '.loops/state.json' });
|
|
425
|
+
```
|
|
426
|
+
|
|
427
|
+
`budget` is the cost guard for a loop that fires a worker plus several judges per iteration: `max` bounds the call _count_, while `budget` bounds their _cost_. The object form `{ limit, headroom, soft }` enables a soft, warn-don't-refuse mode.
|
|
428
|
+
|
|
429
|
+
### Rate limits, quotas, and budgets: wait or resume
|
|
430
|
+
|
|
431
|
+
When a run hits a provider **rate limit**, an account **usage allowance**, or its own **token budget**, the `onLimit` policy decides what happens. The default, `auto`, **waits** when the reset is known and within the `maxWaitMs` cap; otherwise it **checkpoints and exits** with a ready-to-paste resume command.
|
|
432
|
+
|
|
433
|
+
| Option | CLI flag | Default | Effect |
|
|
434
|
+
| ----------- | ----------------------- | ------- | -------------------------------------------------------------------------------------------------- |
|
|
435
|
+
| `onLimit` | `--on-limit <policy>` | `auto` | `auto` waits a known reset ≤ `maxWaitMs`, else pauses · `wait` always waits a known reset · `exit-resume` never waits · `fail` is the old fatal behaviour |
|
|
436
|
+
| `maxWaitMs` | `--max-wait <dur>` | `300000` (5m) | Ceiling on a single interruptible limit-wait under `auto`/`wait`. |
|
|
437
|
+
|
|
438
|
+
A wait is **interruptible** (Ctrl-C unwinds it). The policy gives up when the reset is unknown, when the wait exceeds `maxWaitMs`, or when the policy is `exit-resume`; it always gives up for a `budget` limit, which never refreshes mid-run. When it gives up, the run ends with the terminal status **`paused`** (exit code **75**, `EX_TEMPFAIL`, distinct from `fail`'s `1`) so a wrapper/cron can tell "paused, resumable" from "failed". With `--checkpoint` set, the resume command is printed ready to paste; without one, the guidance says to re-run with `--checkpoint` to make a pause resumable.
|
|
439
|
+
|
|
440
|
+
The error taxonomy backs this: an engine classifies a throttle as a `RATE_LIMIT` or `QUOTA` `LoopError`, carrying whatever reset hint (`retryAfterMs` / `resetAt`) it could read. `RATE_LIMIT` is retryable; `QUOTA` is retryable only when a reset is known; `BUDGET` never is.
|
|
441
|
+
|
|
442
|
+
## Output: TUI, plain, JSON
|
|
443
|
+
|
|
444
|
+
- **Ink TUI** (default on a TTY): a live loop/dag tree, a per-iteration detail panel you can browse while the run continues, and a stats footer. Navigate with `↑/↓` (nodes), `←/→` (iterations), `f`/`space` (follow-live), `q`/`Esc`/`Ctrl-C` (abort).
|
|
445
|
+
- **`--no-tui`**: streamed line logs, one concise report per completed iteration, e.g. `↳ iter 2: body=fail · until=not met · review=fail (needs X) · 1.2k/0.3k tok`.
|
|
446
|
+
- **`--json`**: NDJSON event stream on stdout.
|
|
447
|
+
|
|
448
|
+
Every mode ends with a summary: result, per-loop iterations, review tallies, token usage by model, and any errors.
|
|
449
|
+
|
|
450
|
+
## What `loops` is (and isn't)
|
|
451
|
+
|
|
452
|
+
`loops` is a **fresh-context loop primitive**, not a durable workflow engine. The design bet is that **the workspace is the state**: progress _and its reasoning_ live in git (the Ledger), so each iteration can start clean and still know what came before. If the process dies mid-run, you re-run against the same workspace (the worktree holds the files, the scratch files hold the why, the log holds the milestones) and continue. You lose the bookkeeping, not the work.
|
|
453
|
+
|
|
454
|
+
It deliberately does **not** do durable mid-run replay (re-running a half-finished graph and skipping completed steps). That's an orchestration concern; for it, embed a `loops` job as a step inside [Temporal](https://temporal.io), [LangGraph](https://github.com/langchain-ai/langgraphjs), or [Mastra](https://mastra.ai). What it _does_ offer (run records, a thin state checkpoint, a token budget) is the lightweight version that fits the workspace-is-state model.
|
|
455
|
+
|
|
456
|
+
| You want… | Reach for… |
|
|
457
|
+
| -------------------------------------------------- | ----------------------------------- |
|
|
458
|
+
| Loop an agent to convergence with a real done-gate | **loops** (you're here) |
|
|
459
|
+
| Durable, resumable, replayable workflows | Temporal / LangGraph / Mastra |
|
|
460
|
+
| One agent call with tool use | your provider's SDK directly |
|
|
461
|
+
|
|
462
|
+
## Roadmap
|
|
463
|
+
|
|
464
|
+
- [x] **Ledger**, git-memory core: the scratch files (working memory + handoff), grounding, milestone commits
|
|
465
|
+
- [x] Worktree isolation (branches-as-teams) with `--no-ff` land-back
|
|
466
|
+
- [x] Environment axis: provider interface + offline mock
|
|
467
|
+
- [ ] Publish to npm (with a built `dist` + `exports`)
|
|
468
|
+
- [ ] Optional `wip:` autosave tier (per-iteration recovery, squashed on convergence)
|
|
469
|
+
- [ ] No-progress / stall detection: the third hard stop, alongside `max` and `budget`
|
|
470
|
+
- [ ] `cost per accepted change` as a first-class reported metric
|
|
471
|
+
- [ ] Calibration helpers for agent judges
|
|
472
|
+
- [ ] More engine adapters (OpenAI, local models)
|
|
473
|
+
- [ ] Scrollable per-iteration transcript in the TUI
|
|
474
|
+
|
|
475
|
+
## Develop
|
|
476
|
+
|
|
477
|
+
```bash
|
|
478
|
+
npm test # vitest: offline, deterministic via the mock engine
|
|
479
|
+
npm run typecheck # tsc --noEmit
|
|
480
|
+
```
|
|
481
|
+
|
|
482
|
+
Contributions welcome. Open an issue to discuss anything substantial first. Keep the core small; that smallness is the point.
|
|
483
|
+
|
|
484
|
+
## License
|
|
485
|
+
|
|
486
|
+
[MIT](./LICENSE)
|
package/bin/loops.mjs
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// Thin launcher. Registers tsx's ESM loader globally so the CLI can transform a
|
|
3
|
+
// user's `.loop.ts` recipe from any repo (the run-from-anywhere contract), then
|
|
4
|
+
// hands off to the CLI. In a published install loops' own code is the built
|
|
5
|
+
// `dist/`; running from source (this repo, no build step) falls back to the
|
|
6
|
+
// TypeScript entry, which the same tsx loader transforms.
|
|
7
|
+
import { existsSync } from 'node:fs';
|
|
8
|
+
import { fileURLToPath, pathToFileURL } from 'node:url';
|
|
9
|
+
import { dirname, join } from 'node:path';
|
|
10
|
+
import { register } from 'tsx/esm/api';
|
|
11
|
+
|
|
12
|
+
// Register the tsx loader first: it must be active before the dynamic import
// at the bottom so TypeScript sources can be loaded.
register();
|
|
13
|
+
// Directory that contains this launcher file.
const here = dirname(fileURLToPath(import.meta.url));
|
|
14
|
+
// Built entry point, located one level above this file in `dist/`.
const dist = join(here, '..', 'dist', 'index.js');
|
|
15
|
+
// Use the built entry when it exists on disk; otherwise fall back to the
// TypeScript source entry under `src/`.
const entry = existsSync(dist) ? dist : join(here, '..', 'src', 'index.ts');
|
|
16
|
+
// Convert the filesystem path to a file: URL and load the chosen entry;
// top-level await keeps the launcher waiting until that module has loaded.
await import(pathToFileURL(entry).href);
|