agentfootprint 6.25.0 → 6.26.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +194 -48
- package/dist/esm/lib/context-bisect/ablation.js +183 -0
- package/dist/esm/lib/context-bisect/ablation.js.map +1 -0
- package/dist/esm/lib/context-bisect/bisect.js +129 -0
- package/dist/esm/lib/context-bisect/bisect.js.map +1 -0
- package/dist/esm/lib/context-bisect/index.js +22 -0
- package/dist/esm/lib/context-bisect/index.js.map +1 -0
- package/dist/esm/lib/context-bisect/llmEdgeWeigher.js +0 -0
- package/dist/esm/lib/context-bisect/llmEdgeWeigher.js.map +1 -0
- package/dist/esm/lib/context-bisect/localize.js +555 -0
- package/dist/esm/lib/context-bisect/localize.js.map +1 -0
- package/dist/esm/lib/context-bisect/types.js +56 -0
- package/dist/esm/lib/context-bisect/types.js.map +1 -0
- package/dist/esm/lib/tool-lint/cli.js +6 -1
- package/dist/esm/lib/tool-lint/cli.js.map +1 -1
- package/dist/esm/observe.js +7 -0
- package/dist/esm/observe.js.map +1 -1
- package/dist/lib/context-bisect/ablation.js +192 -0
- package/dist/lib/context-bisect/ablation.js.map +1 -0
- package/dist/lib/context-bisect/bisect.js +133 -0
- package/dist/lib/context-bisect/bisect.js.map +1 -0
- package/dist/lib/context-bisect/index.js +40 -0
- package/dist/lib/context-bisect/index.js.map +1 -0
- package/dist/lib/context-bisect/llmEdgeWeigher.js +0 -0
- package/dist/lib/context-bisect/llmEdgeWeigher.js.map +1 -0
- package/dist/lib/context-bisect/localize.js +563 -0
- package/dist/lib/context-bisect/localize.js.map +1 -0
- package/dist/lib/context-bisect/types.js +59 -0
- package/dist/lib/context-bisect/types.js.map +1 -0
- package/dist/lib/tool-lint/cli.js +30 -2
- package/dist/lib/tool-lint/cli.js.map +1 -1
- package/dist/observe.js +42 -20
- package/dist/observe.js.map +1 -1
- package/dist/types/lib/context-bisect/ablation.d.ts +97 -0
- package/dist/types/lib/context-bisect/ablation.d.ts.map +1 -0
- package/dist/types/lib/context-bisect/bisect.d.ts +76 -0
- package/dist/types/lib/context-bisect/bisect.d.ts.map +1 -0
- package/dist/types/lib/context-bisect/index.d.ts +22 -0
- package/dist/types/lib/context-bisect/index.d.ts.map +1 -0
- package/dist/types/lib/context-bisect/llmEdgeWeigher.d.ts +125 -0
- package/dist/types/lib/context-bisect/llmEdgeWeigher.d.ts.map +1 -0
- package/dist/types/lib/context-bisect/localize.d.ts +119 -0
- package/dist/types/lib/context-bisect/localize.d.ts.map +1 -0
- package/dist/types/lib/context-bisect/types.d.ts +356 -0
- package/dist/types/lib/context-bisect/types.d.ts.map +1 -0
- package/dist/types/lib/tool-lint/cli.d.ts.map +1 -1
- package/dist/types/observe.d.ts +1 -0
- package/dist/types/observe.d.ts.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,17 +1,23 @@
|
|
|
1
1
|
|
|
2
|
+
|
|
3
|
+
<h1 align="center">Agentfootprint</h1>
|
|
4
|
+
|
|
2
5
|
<p align="center">
|
|
3
|
-
<picture>
|
|
4
|
-
<source media="(prefers-color-scheme: dark)" srcset="docs/assets/hero-dark.svg">
|
|
5
|
-
<source media="(prefers-color-scheme: light)" srcset="docs/assets/hero-light.svg">
|
|
6
|
-
<img alt="agentfootprint mascot composing context flavors (Skills, Steering, Guardrails, RAG, Tool APIs, Memory) into three structured LLM slots (system, messages, tools) — the central abstraction, visualized." src="docs/assets/hero-light.svg" width="100%"/>
|
|
7
|
-
</picture>
|
|
6
|
+
<strong>Your agent picked the wrong tool, gave a wrong answer — and the logs can't tell you why.<br/>Agentfootprint can.</strong>
|
|
8
7
|
</p>
|
|
9
8
|
|
|
10
|
-
<
|
|
9
|
+
<p align="center">
|
|
10
|
+
The explainable agent framework: every read, write, decision, and tool call becomes
|
|
11
|
+
<strong>connected evidence</strong> as your agent runs. When something goes wrong, you don't grep logs — you ask.
|
|
12
|
+
</p>
|
|
11
13
|
|
|
12
14
|
<p align="center">
|
|
13
|
-
<
|
|
14
|
-
|
|
15
|
+
<a href="https://footprintjs.github.io/agentThinkingUI/">
|
|
16
|
+
<img src="docs/assets/hero-atui.png" alt="An agent run replayed in AgentThinkingUI — the LLM 'brain' calls the Flight-search tool, the step inspector shows the tool's raw output and the brain's reasoning about it, and the timeline scrubs every step of the run." width="100%">
|
|
17
|
+
</a>
|
|
18
|
+
</p>
|
|
19
|
+
<p align="center">
|
|
20
|
+
<sub>A real run, replayed — rendered with <a href="https://github.com/footprintjs/agentThinkingUI"><b>AgentThinkingUI</b></a> (<code>npm i agentthinkingui</code>). Every frame is generated from the run's own trace; <a href="https://footprintjs.github.io/agentThinkingUI/">▶ watch it live</a>.</sub>
|
|
15
21
|
</p>
|
|
16
22
|
|
|
17
23
|
<p align="center">
|
|
@@ -26,7 +32,176 @@
|
|
|
26
32
|
|
|
27
33
|
---
|
|
28
34
|
|
|
29
|
-
##
|
|
35
|
+
## The new error class
|
|
36
|
+
|
|
37
|
+
For decades, software had two kinds of errors — and developers never needed deep
|
|
38
|
+
domain knowledge to fix either:
|
|
39
|
+
|
|
40
|
+
| Error class | Where the bug lives | How you find it |
|
|
41
|
+
|---|---|---|
|
|
42
|
+
| **Infrastructure** — crash, timeout, 500 | the system | infra logs, monitoring |
|
|
43
|
+
| **Business logic** — wrong branch, wrong math | the code | stack trace, debugger, `console.log` |
|
|
44
|
+
| **Contextual** — wrong tool chosen, wrong fact believed, stale memory trusted | **what the model was given** | **nothing. Until now.** |
|
|
45
|
+
|
|
46
|
+
Agents introduced the third class. The code is correct, the infra is healthy — and
|
|
47
|
+
the run is still wrong, because two tool descriptions read alike, or an injected
|
|
48
|
+
fact was misleading, or memory carried last week's truth. Classical logs can't
|
|
49
|
+
explain it: **they record what the code did, never what the context did.**
|
|
50
|
+
|
|
51
|
+
## The idea
|
|
52
|
+
|
|
53
|
+
If contextual errors live in what the model was given, then the run itself must be
|
|
54
|
+
structured so context is **evidence** — every injection, read, write, decision, and
|
|
55
|
+
tool call recorded *connected*, the moment it happens. Not logs you grep. Evidence
|
|
56
|
+
you ask.
|
|
57
|
+
|
|
58
|
+
## How — we abstract context engineering
|
|
59
|
+
|
|
60
|
+
Every piece of context enters the LLM through one of **3 slots** (`system` ·
|
|
61
|
+
`messages` · `tools`), under one of **4 triggers** — skills, steering, RAG, facts,
|
|
62
|
+
memory, guardrails are all the same move: `Injection = slot × trigger × cache`.
|
|
63
|
+
|
|
64
|
+
**Because the framework owns that injection point, every piece of context is born
|
|
65
|
+
tracked.** Tracking isn't an add-on you wire up — it's a consequence of the
|
|
66
|
+
abstraction. [The full model ↓](#the-model--what-we-abstract)
|
|
67
|
+
|
|
68
|
+
<p align="center">
|
|
69
|
+
<picture>
|
|
70
|
+
<source media="(prefers-color-scheme: dark)" srcset="docs/assets/hero-dark.svg">
|
|
71
|
+
<source media="(prefers-color-scheme: light)" srcset="docs/assets/hero-light.svg">
|
|
72
|
+
<img alt="agentfootprint mascot composing context flavors (Skills, Steering, Guardrails, RAG, Tool APIs, Memory) into three structured LLM slots (system, messages, tools) — the central abstraction, visualized." src="docs/assets/hero-light.svg" width="100%"/>
|
|
73
|
+
</picture>
|
|
74
|
+
</p>
|
|
75
|
+
|
|
76
|
+
## What tracking buys you
|
|
77
|
+
|
|
78
|
+
**See it in 30 seconds** — four questions logs can't answer, each answered by code in this repo from a real run:
|
|
79
|
+
|
|
80
|
+
```text
|
|
81
|
+
Q: Why did the model pick refund_full instead of refund_partial?
|
|
82
|
+
A: margin 0.02 — ⚠ NARROW: the two tool descriptions read nearly identical
|
|
83
|
+
(toolChoiceRecorder — and the catalog lint flags the pair before you ever run)
|
|
84
|
+
|
|
85
|
+
Q: Why was this loan declined?
|
|
86
|
+
A: decision ← [control: "DTI above the 0.43 affordability ceiling"] ← dti 0.52 ← monthlyDebt / income
|
|
87
|
+
(decide() evidence + the causal slice — every hop is a real recorded edge)
|
|
88
|
+
|
|
89
|
+
Q: Which piece of context made the answer wrong?
|
|
90
|
+
A: CAUSAL: ablating fact 'vip-override' flipped the outcome in 3/3 seeded reruns
|
|
91
|
+
(localizeContextBug — ranked proxies, counterfactual proof)
|
|
92
|
+
|
|
93
|
+
Q: Prove nobody edited this run's record.
|
|
94
|
+
A: verifyAuditBundle → valid: false, brokenAt: #16 — the tampered record, named
|
|
95
|
+
(hash-chained audit export, offline verification)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
And you don't have to read the trace yourself — **we provide the tools for an LLM to track it for you**: the trace toolpack let a debugger model find a planted bug while reading **9.5% of the trace** ([guide](docs/guides/trace-debugging.md)).
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Pick your door
|
|
103
|
+
|
|
104
|
+
| 🔧 Building an agent? | 🐛 Agent misbehaving? | 🏛️ Need audit / compliance? |
|
|
105
|
+
|---|---|---|
|
|
106
|
+
| Typed agents with skills, steering, RAG, memory, guardrails — and the trace for free. | Lint your tool catalog in 5 minutes — works on **any** framework's tool list (plain JSON / MCP / OpenAI / Anthropic shapes). Then causal slices, context bisection, and the debugger-LLM toolpack. | Hash-chained, tamper-evident run records with an offline verifier — record-keeping in the EU-AI-Act shape. |
|
|
107
|
+
| [→ Quick start](#quick-start--runs-offline-no-api-key) | [→ Tool-catalog lint](docs/guides/tool-catalog-lint.md) · [→ Trace debugging](docs/guides/trace-debugging.md) | [→ Tamper-evident audit](docs/guides/security.md) |
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## Quick start — runs offline, no API key
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
npm install agentfootprint footprintjs
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
```typescript
|
|
118
|
+
import { Agent, defineTool, mock } from 'agentfootprint';
|
|
119
|
+
|
|
120
|
+
const weather = defineTool({
|
|
121
|
+
name: 'weather',
|
|
122
|
+
description: 'Get current weather for a city.',
|
|
123
|
+
inputSchema: {
|
|
124
|
+
type: 'object',
|
|
125
|
+
properties: { city: { type: 'string' } },
|
|
126
|
+
required: ['city'],
|
|
127
|
+
},
|
|
128
|
+
execute: async ({ city }: { city: string }) => `${city}: 72°F, sunny`,
|
|
129
|
+
});
|
|
130
|
+
|
|
131
|
+
const agent = Agent.create({
|
|
132
|
+
provider: mock({ reply: 'I checked: it is 72°F and sunny.' }),
|
|
133
|
+
model: 'mock',
|
|
134
|
+
})
|
|
135
|
+
.system('You answer weather questions using the weather tool.')
|
|
136
|
+
.tool(weather)
|
|
137
|
+
.build();
|
|
138
|
+
|
|
139
|
+
const result = await agent.run({ message: 'Weather in Paris?' });
|
|
140
|
+
console.log(result); // → "I checked: it is 72°F and sunny."
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
For production, import a real provider from `agentfootprint/llm-providers` and swap it in — `anthropic(...)` / `openai(...)` / `bedrock(...)` / `ollama(...)`. Only the import line changes; the agent code stays the same. (The vendor-SDK providers live on the `agentfootprint/llm-providers` subpath so the main `agentfootprint` barrel stays free of optional peer-dep requires; `mock`, `browserAnthropic`, and `browserOpenai` are on the main barrel.)
|
|
144
|
+
|
|
145
|
+
### Then add context
|
|
146
|
+
|
|
147
|
+
A real agent carries more than one prompt and one tool: facts about the user, always-on rules, skills that unlock on demand. Declare each piece — the framework decides **when** it fires and **which slot** it lands in, and every piece is born tracked:
|
|
148
|
+
|
|
149
|
+
```typescript
|
|
150
|
+
import { defineFact, defineSteering, defineSkill } from 'agentfootprint';
|
|
151
|
+
|
|
152
|
+
const agent = Agent.create({ provider, model })
|
|
153
|
+
.system('You are a support agent.')
|
|
154
|
+
.fact(defineFact({ // data the model should know — always on
|
|
155
|
+
id: 'user-profile',
|
|
156
|
+
data: 'Name: Maya · Plan: Pro · Customer since 2022',
|
|
157
|
+
}))
|
|
158
|
+
.steering(defineSteering({ // rules the model must follow — always on
|
|
159
|
+
id: 'refund-policy',
|
|
160
|
+
prompt: 'Never promise a refund before checking the policy tool.',
|
|
161
|
+
}))
|
|
162
|
+
.skill(defineSkill({ // guidance + tools — unlocks when the LLM asks
|
|
163
|
+
id: 'billing',
|
|
164
|
+
description: 'Use for refunds, charges, billing questions.',
|
|
165
|
+
body: 'When handling billing: confirm identity first, then…',
|
|
166
|
+
tools: [refundTool],
|
|
167
|
+
}))
|
|
168
|
+
.build();
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
Same shape for `.instruction()` / `.memory()` / `.rag()` / raw `.injection()` — they're all the one primitive, `Injection = slot × trigger × cache`. [The full model ↓](#the-model--what-we-abstract)
|
|
172
|
+
|
|
173
|
+
### Then compose control flow
|
|
174
|
+
|
|
175
|
+
One agent is a `Runner`. So is every composition of agents — four control-flow primitives, and anything that runs composes into anything else:
|
|
176
|
+
|
|
177
|
+
```typescript
|
|
178
|
+
import { Sequence, Parallel, Conditional } from 'agentfootprint';
|
|
179
|
+
|
|
180
|
+
const pipeline = Sequence.create()
|
|
181
|
+
.step('classify', classifyAgent) // sequence: step → step
|
|
182
|
+
.step('review',
|
|
183
|
+
Parallel.create() // parallel: fan out, then merge
|
|
184
|
+
.branch('legal', legalAgent)
|
|
185
|
+
.branch('ethics', ethicsAgent)
|
|
186
|
+
.mergeWithLLM({ provider, model, prompt: 'Synthesize:' })
|
|
187
|
+
.build())
|
|
188
|
+
.step('respond',
|
|
189
|
+
Conditional.create() // conditional: one branch runs
|
|
190
|
+
.when('urgent', (i) => i.message.startsWith('URGENT'), urgentAgent)
|
|
191
|
+
.otherwise('normal', normalAgent)
|
|
192
|
+
.build())
|
|
193
|
+
.build();
|
|
194
|
+
|
|
195
|
+
await pipeline.run({ message: 'URGENT: refund dispute on order #4411' });
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
The fourth primitive is `Loop` — `Loop.repeat(agent).until(guard).times(5)`, with a mandatory budget guard. And the named patterns from the research literature ship pre-composed from the same four: `selfConsistency` · `reflection` · `debate` · `mapReduce` · `tot` · `swarm`. Because every composition is a flowchart, the structure you wrote is the structure you see in the UI — and the trace spans the whole pipeline, not one agent at a time. [Designing systems of agents ↓](#how-do-i-design-my-agent-or-system-of-agents)
|
|
199
|
+
|
|
200
|
+
---
|
|
201
|
+
|
|
202
|
+
## The model — what we abstract
|
|
203
|
+
|
|
204
|
+
|
|
30
205
|
|
|
31
206
|
When you build an Agentic Application, you collect domain-specific data and instructions, then wire them up based on what your system receives.
|
|
32
207
|
|
|
@@ -70,7 +245,7 @@ That's the whole model: `Injection = slot × trigger × cache`.
|
|
|
70
245
|
|
|
71
246
|
---
|
|
72
247
|
|
|
73
|
-
##
|
|
248
|
+
## Why we chose this abstraction
|
|
74
249
|
|
|
75
250
|
The agent space has many credible primary abstractions:
|
|
76
251
|
|
|
@@ -143,7 +318,7 @@ And a fourth, novel: **the agent can read its own trace.** Six months after the
|
|
|
143
318
|
|
|
144
319
|
---
|
|
145
320
|
|
|
146
|
-
##
|
|
321
|
+
## How do I design my agent or system of agents?
|
|
147
322
|
|
|
148
323
|
Two scales — same alphabet. Four control flows are the entire vocabulary.
|
|
149
324
|
|
|
@@ -305,7 +480,14 @@ Same trick as Beat 1: instead of N libraries for N patterns, we found the M buil
|
|
|
305
480
|
|
|
306
481
|
---
|
|
307
482
|
|
|
308
|
-
##
|
|
483
|
+
## How do I see what my agent did?
|
|
484
|
+
|
|
485
|
+
<p align="center">
|
|
486
|
+
<img src="docs/assets/lens-run.png" alt="A real agent run in the Lens: the conversation (with live PII redaction), the executed path lit on the merge-tree flowchart, the WHAT-HAPPENED timeline of every iteration/context/LLM turn/route, run stats, and the step inspector — all generated from the run's own trace." width="100%">
|
|
487
|
+
</p>
|
|
488
|
+
<p align="center">
|
|
489
|
+
<sub>One real run, fully explained — the <a href="https://github.com/footprintjs/agentfootprint-lens"><b>Lens</b></a> (<code>npm i agentfootprint-lens</code>): conversation · executed path · per-step timeline · stats, every pixel from the trace.</sub>
|
|
490
|
+
</p>
|
|
309
491
|
|
|
310
492
|
Because we own the loop (Beat 2), every decision and execution is captured during traversal — not bolted on. The default capture is the **causal trace**: every stage, read, write, and decision evidence, as a JSON-portable, scrubbable, queryable, exportable artifact. Beyond the default, wire custom recorders for cost, latency, or quality scoring — any observation hook fires on the same stream.
|
|
311
493
|
|
|
@@ -427,42 +609,6 @@ off the hot path.
|
|
|
427
609
|
|
|
428
610
|
---
|
|
429
611
|
|
|
430
|
-
## Quick start — runs offline, no API key
|
|
431
|
-
|
|
432
|
-
```bash
|
|
433
|
-
npm install agentfootprint footprintjs
|
|
434
|
-
```
|
|
435
|
-
|
|
436
|
-
```typescript
|
|
437
|
-
import { Agent, defineTool, mock } from 'agentfootprint';
|
|
438
|
-
|
|
439
|
-
const weather = defineTool({
|
|
440
|
-
name: 'weather',
|
|
441
|
-
description: 'Get current weather for a city.',
|
|
442
|
-
inputSchema: {
|
|
443
|
-
type: 'object',
|
|
444
|
-
properties: { city: { type: 'string' } },
|
|
445
|
-
required: ['city'],
|
|
446
|
-
},
|
|
447
|
-
execute: async ({ city }: { city: string }) => `${city}: 72°F, sunny`,
|
|
448
|
-
});
|
|
449
|
-
|
|
450
|
-
const agent = Agent.create({
|
|
451
|
-
provider: mock({ reply: 'I checked: it is 72°F and sunny.' }),
|
|
452
|
-
model: 'mock',
|
|
453
|
-
})
|
|
454
|
-
.system('You answer weather questions using the weather tool.')
|
|
455
|
-
.tool(weather)
|
|
456
|
-
.build();
|
|
457
|
-
|
|
458
|
-
const result = await agent.run({ message: 'Weather in Paris?' });
|
|
459
|
-
console.log(result); // → "I checked: it is 72°F and sunny."
|
|
460
|
-
```
|
|
461
|
-
|
|
462
|
-
For production, import a real provider from `agentfootprint/llm-providers` and swap it in — `anthropic(...)` / `openai(...)` / `bedrock(...)` / `ollama(...)`. Only the import line changes; the agent code stays the same. (The vendor-SDK providers live on the `agentfootprint/llm-providers` subpath so the main `agentfootprint` barrel stays free of optional peer-dep requires; `mock`, `browserAnthropic`, and `browserOpenai` are on the main barrel.)
|
|
463
|
-
|
|
464
|
-
---
|
|
465
|
-
|
|
466
612
|
## Mocks first, production second
|
|
467
613
|
|
|
468
614
|
Build the entire app against in-memory mocks with **zero API cost**, then swap real infrastructure one boundary at a time.
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Ablation — the counterfactual seam (RFC-003 Part B, D8 stage 4 + the
|
|
3
|
+
* D9 stats engine).
|
|
4
|
+
*
|
|
5
|
+
* Three pieces:
|
|
6
|
+
*
|
|
7
|
+
* 1. **Adapters** — `ablationForSuspect` maps a classified suspect to
|
|
8
|
+
* the spec that removes it (tool → drop from catalog; injection /
|
|
9
|
+
* fact / skill → exclude the `Injection.id`; memory → filter the
|
|
10
|
+
* `MemoryEntry.id`; arg → consumer-override note).
|
|
11
|
+
*
|
|
12
|
+
* 2. **The seam** — `applyAblations` filters the inputs an agent is
|
|
13
|
+
* BUILT from. Documented here because the seam did not previously
|
|
14
|
+
* exist: `AgentOptions` has no `ignoredTools` runtime kill-switch, so
|
|
15
|
+
* tool ablation happens at construction (the consumer's
|
|
16
|
+
* `AblationRunner` rebuilds the agent from filtered inputs). Same for
|
|
17
|
+
* injections and memory entries.
|
|
18
|
+
*
|
|
19
|
+
* 3. **The probe engine** — `runAblationProbe` calls the consumer's
|
|
20
|
+
* runner N seeded times, measures embedding similarity to the
|
|
21
|
+
* original output, counts outcome flips, and returns variance —
|
|
22
|
+
* never a single-run verdict (D9 discipline).
|
|
23
|
+
*
|
|
24
|
+
* §B2: only `runAblationProbe`-derived verdicts are causal claims; every
|
|
25
|
+
* score elsewhere is a correlational proxy.
|
|
26
|
+
*/
|
|
27
|
+
import { cosineSimilarity } from '../../memory/embedding/cosine.js';
|
|
28
|
+
import { CONTEXT_BISECT_DEFAULTS } from './types.js';
|
|
29
|
+
// ─── Adapters: suspect → spec ────────────────────────────────────────
|
|
30
|
+
/**
 * Map one classified suspect to the ablation spec that removes it.
 *
 * - `'tool'`      → `{ kind: 'tool', ignoredTools: [detail.toolName] }`
 * - `'injection'` → `{ kind: 'injection', excludeInjectionIds: [detail.injectionId] }`
 * - `'memory'`    → `{ kind: 'memory', excludeMemoryIds: [detail.injectionId] }`
 * - `'arg'`       → a note-only spec; the runner has to override the input itself
 * - `'stage'`     → `undefined` (plain pipeline stages have no removable input;
 *                   re-rank or refactor)
 *
 * `undefined` is also returned when a tool / injection / memory suspect does
 * not carry the identifier it needs, and for any kind not listed above.
 *
 * @param suspect - the suspect; `kind`, `detail` and `source` are read.
 * @returns the removing spec, or `undefined` when nothing can be removed.
 */
export function ablationForSuspect(suspect) {
  const { kind, detail, source } = suspect;

  if (kind === 'tool') {
    const toolName = detail?.toolName;
    if (toolName === undefined) return undefined;
    return { kind: 'tool', ignoredTools: [toolName] };
  }

  if (kind === 'injection') {
    const injectionId = detail?.injectionId;
    if (injectionId === undefined) return undefined;
    return { kind: 'injection', excludeInjectionIds: [injectionId] };
  }

  if (kind === 'memory') {
    // The memory entry's id is read from `detail.injectionId`.
    const memoryId = detail?.injectionId;
    if (memoryId === undefined) return undefined;
    return { kind: 'memory', excludeMemoryIds: [memoryId] };
  }

  if (kind === 'arg') {
    const note = [
      `step ${source} consumed untracked run input ($getArgs()/env) — `,
      `the runner must override the input itself; the library cannot filter it.`,
    ].join('');
    return { kind: 'arg', source, note };
  }

  // 'stage' and anything unrecognised: nothing to ablate.
  return undefined;
}
|
|
59
|
+
/**
 * Filter the inputs an agent is constructed from according to a list of
 * ablation specs. Tools are matched on `tool.schema.name`; injections and
 * memory entries are matched on their `id`. A missing target list is treated
 * as empty, so the result always has all three arrays.
 *
 * Specs of kind `'arg'` (and any other kind) are ignored here on purpose:
 * run input belongs to the consumer's runner (`spec.note` says so).
 *
 * @example inside an AblationRunner
 * ```ts
 * const { tools, injections } = applyAblations(specs, {
 *   tools: ALL_TOOLS, injections: ALL_FACTS,
 * });
 * const agent = Agent.create({ provider: freshProvider(), model })
 *   .tools([...tools]);
 * for (const inj of injections) agent.fact(inj);
 * ```
 *
 * @param specs - iterable of ablation specs.
 * @param targets - `{ tools?, injections?, memoryEntries? }` to filter.
 * @returns `{ tools, injections, memoryEntries }` as new, filtered arrays.
 */
export function applyAblations(specs, targets) {
  const droppedToolNames = new Set();
  const droppedInjectionIds = new Set();
  const droppedMemoryIds = new Set();

  for (const spec of specs) {
    switch (spec.kind) {
      case 'tool':
        for (const toolName of spec.ignoredTools) droppedToolNames.add(toolName);
        break;
      case 'injection':
        for (const injectionId of spec.excludeInjectionIds) droppedInjectionIds.add(injectionId);
        break;
      case 'memory':
        for (const memoryId of spec.excludeMemoryIds) droppedMemoryIds.add(memoryId);
        break;
      default:
        // 'arg' and unknown kinds are not the library's to filter.
        break;
    }
  }

  // Keep every item whose key is not in the dropped set.
  const survivors = (items, dropped, keyOf) =>
    (items ?? []).filter((item) => !dropped.has(keyOf(item)));

  return {
    tools: survivors(targets.tools, droppedToolNames, (tool) => tool.schema.name),
    injections: survivors(targets.injections, droppedInjectionIds, (injection) => injection.id),
    memoryEntries: survivors(targets.memoryEntries, droppedMemoryIds, (entry) => entry.id),
  };
}
|
|
98
|
+
// ─── The probe engine (D9 stats) ─────────────────────────────────────
|
|
99
|
+
/**
 * Summarise a list of similarity scores.
 *
 * Returns the mean, minimum, maximum and the population standard deviation
 * (the squared deviations are divided by `values.length`, not `length - 1`).
 * An empty list yields all zeros.
 *
 * Min and max are folded pairwise instead of spreading `values` into
 * `Math.min` / `Math.max`: spreading passes every element as a call argument
 * and throws a `RangeError` once the list is large enough.
 *
 * @param values - array of numbers.
 * @returns `{ mean, min, max, stdev }`.
 */
function similarityStats(values) {
  const count = values.length;
  if (count === 0) {
    return { mean: 0, min: 0, max: 0, stdev: 0 };
  }

  let total = 0;
  let min = Infinity;
  let max = -Infinity;
  for (const value of values) {
    total += value;
    // Math.min/Math.max keep the NaN-propagating behaviour of the spread form.
    min = Math.min(min, value);
    max = Math.max(max, value);
  }
  const mean = total / count;

  let squaredDeviations = 0;
  for (const value of values) {
    squaredDeviations += (value - mean) ** 2;
  }

  return {
    mean,
    min,
    max,
    stdev: Math.sqrt(squaredDeviations / count),
  };
}
|
|
111
|
+
/**
 * Build the default outcome comparator.
 *
 * The returned async function embeds both texts through `embedder.embed`
 * (the two calls are issued together and awaited as a pair) and reports the
 * outcome as changed when their cosine similarity is strictly below
 * `flipThreshold`.
 *
 * @param embedder - object exposing `embed({ text })`.
 * @param flipThreshold - similarity below this value counts as a flip.
 * @returns `async (original, ablated) => boolean`.
 */
export function defaultOutcomeComparator(embedder, flipThreshold) {
  return async (original, ablated) => {
    const pendingVectors = [original, ablated].map((text) => embedder.embed({ text }));
    const [originalVec, ablatedVec] = await Promise.all(pendingVectors);
    const similarity = cosineSimilarity(originalVec, ablatedVec);
    return similarity < flipThreshold;
  };
}
|
|
121
|
+
/**
 * Run ONE probe: call the consumer's runner with `specs` once per seed
 * (0..samples-1), measure each output's embedding similarity to the
 * original output, and count outcome flips. Spread is always reported via
 * the `similarity` stats.
 *
 * `samples` handling — D9: never single-run verdicts.
 * - `config.rerun.samples` falls back to `CONTEXT_BISECT_DEFAULTS.samples`
 *   when it is `null`/`undefined`.
 * - The value is rounded UP to a whole number and clamped to ≥ 2, so the
 *   reported `samples` always equals the number of reruns actually executed.
 * - A non-finite value (`NaN`, `±Infinity`) is rejected: `NaN` would
 *   otherwise run zero reruns and still produce stats, and `Infinity` would
 *   never terminate.
 *
 * Flip detection: a consumer-supplied `config.rerun.outcomeChanged` is
 * awaited for every rerun. Without one, a rerun counts as flipped when the
 * similarity already measured for it is below the flip threshold — the same
 * rule `defaultOutcomeComparator` applies, without embedding both texts a
 * second time per rerun.
 *
 * @param config - `{ embedder, rerun }`; `rerun` supplies `runner`,
 *   `originalOutput` and the optional `samples`, `flipThreshold`,
 *   `outcomeChanged`.
 * @param specs - ablation specs handed unchanged to `rerun.runner`.
 * @returns `{ samples, flips, similarity }`.
 * @throws {RangeError} when the sample count is not a finite number.
 */
export async function runAblationProbe(config, specs) {
  const { embedder, rerun } = config;

  const requestedSamples = Number(rerun.samples ?? CONTEXT_BISECT_DEFAULTS.samples);
  if (!Number.isFinite(requestedSamples)) {
    throw new RangeError(
      `runAblationProbe: rerun.samples must be a finite number, got ${requestedSamples}`,
    );
  }
  // Round up: a fractional request such as 2.5 has always executed 3 reruns.
  const samples = Math.max(2, Math.ceil(requestedSamples));

  const flipThreshold = rerun.flipThreshold ?? CONTEXT_BISECT_DEFAULTS.flipThreshold;
  const customOutcomeChanged = rerun.outcomeChanged;

  const similarities = [];
  let flips = 0;
  const originalVec = await embedder.embed({ text: rerun.originalOutput });

  // Reruns are sequential on purpose: one seeded run at a time.
  for (let seed = 0; seed < samples; seed++) {
    const output = await rerun.runner(specs, { seed });
    const outputVec = await embedder.embed({ text: output });
    const similarity = cosineSimilarity(originalVec, outputVec);
    similarities.push(similarity);

    const flipped =
      customOutcomeChanged != null
        ? await customOutcomeChanged(rerun.originalOutput, output)
        : similarity < flipThreshold;
    if (flipped) {
      flips++;
    }
  }

  return { samples, flips, similarity: similarityStats(similarities) };
}
|
|
144
|
+
/**
 * Majority-flip rule shared by D8 verdicts and D9 probes: a probe counts as
 * flipped only when strictly more than half of its reruns changed outcome.
 *
 * @param stats - probe stats; `flips` and `samples` are read.
 * @returns `true` when `flips` is a strict majority of `samples`.
 */
export function probeFlipped(stats) {
  const { flips, samples } = stats;
  const doubledFlips = flips * 2;
  return doubledFlips > samples;
}
|
|
148
|
+
/**
 * Translate probe evidence into the verdict — the ONLY causal claim tier
 * (§B2). Checked in this order:
 *
 * 1. `baselineStable` is falsy (the un-ablated scenario itself flipped) →
 *    `'inconclusive'`; `stats` is not read at all in this case.
 * 2. `probeFlipped(stats)` → `'confirmed'`, quoting flips/samples and the
 *    similarity mean ± stdev to three decimals.
 * 3. at least one flip, but no majority → `'inconclusive'`.
 * 4. no flips → `'not-confirmed'`.
 *
 * @param label - human-readable name of the ablated suspect, interpolated
 *   into the claim text.
 * @param stats - probe stats (`flips`, `samples`, `similarity.mean`,
 *   `similarity.stdev`).
 * @param baselineStable - whether the un-ablated baseline kept its outcome.
 * @returns `{ verdict, claim }`.
 */
export function verdictFor(label, stats, baselineStable) {
  // Assemble the result; claim fragments are concatenated as-is.
  const outcome = (verdict, ...claimParts) => ({ verdict, claim: claimParts.join('') });

  if (!baselineStable) {
    return outcome(
      'inconclusive',
      `INCONCLUSIVE: the un-ablated baseline itself changed outcome across seeded reruns — `,
      `no ablation verdict for ${label} is trustworthy on an unstable scenario.`,
    );
  }

  if (probeFlipped(stats)) {
    const { flips, samples, similarity } = stats;
    return outcome(
      'confirmed',
      `CAUSAL: ablating ${label} flipped the outcome in ${flips}/${samples} `,
      `seeded reruns (mean similarity to original ${similarity.mean.toFixed(3)} `,
      `± ${similarity.stdev.toFixed(3)}).`,
    );
  }

  const { flips, samples } = stats;
  if (flips > 0) {
    return outcome(
      'inconclusive',
      `INCONCLUSIVE: ablating ${label} flipped only ${flips}/${samples} seeded `,
      `reruns — below majority; raise samples or check scenario stability.`,
    );
  }

  return outcome(
    'not-confirmed',
    `NOT CONFIRMED: ablating ${label} did not change the outcome in ${samples} seeded `,
    `reruns — its ranking remains a correlational proxy only.`,
  );
}
|
|
183
|
+
//# sourceMappingURL=ablation.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ablation.js","sourceRoot":"","sources":["../../../../src/lib/context-bisect/ablation.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AAEH,OAAO,EAAE,gBAAgB,EAAE,MAAM,kCAAkC,CAAC;AAWpE,OAAO,EAAE,uBAAuB,EAAE,MAAM,YAAY,CAAC;AAErD,wEAAwE;AAExE;;;GAGG;AACH,MAAM,UAAU,kBAAkB,CAAC,OAAgB;IACjD,QAAQ,OAAO,CAAC,IAAI,EAAE,CAAC;QACrB,KAAK,MAAM;YACT,OAAO,OAAO,CAAC,MAAM,EAAE,QAAQ,KAAK,SAAS;gBAC3C,CAAC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,YAAY,EAAE,CAAC,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,EAAE;gBAC3D,CAAC,CAAC,SAAS,CAAC;QAChB,KAAK,WAAW;YACd,OAAO,OAAO,CAAC,MAAM,EAAE,WAAW,KAAK,SAAS;gBAC9C,CAAC,CAAC,EAAE,IAAI,EAAE,WAAW,EAAE,mBAAmB,EAAE,CAAC,OAAO,CAAC,MAAM,CAAC,WAAW,CAAC,EAAE;gBAC1E,CAAC,CAAC,SAAS,CAAC;QAChB,KAAK,QAAQ;YACX,OAAO,OAAO,CAAC,MAAM,EAAE,WAAW,KAAK,SAAS;gBAC9C,CAAC,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,gBAAgB,EAAE,CAAC,OAAO,CAAC,MAAM,CAAC,WAAW,CAAC,EAAE;gBACpE,CAAC,CAAC,SAAS,CAAC;QAChB,KAAK,KAAK;YACR,OAAO;gBACL,IAAI,EAAE,KAAK;gBACX,MAAM,EAAE,OAAO,CAAC,MAAM;gBACtB,IAAI,EACF,QAAQ,OAAO,CAAC,MAAM,mDAAmD;oBACzE,0EAA0E;aAC7E,CAAC;QACJ,KAAK,OAAO;YACV,OAAO,SAAS,CAAC;IACrB,CAAC;AACH,CAAC;AAwBD;;;;;;;;;;;;;;;;;GAiBG;AACH,MAAM,UAAU,cAAc,CAK5B,KAA8B,EAC9B,OAAyD;IAMzD,MAAM,YAAY,GAAG,IAAI,GAAG,EAAU,CAAC;IACvC,MAAM,kBAAkB,GAAG,IAAI,GAAG,EAAU,CAAC;IAC7C,MAAM,cAAc,GAAG,IAAI,GAAG,EAAU,CAAC;IACzC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,IAAI,IAAI,CAAC,IAAI,KAAK,MAAM;YAAE,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,YAAY;gBAAE,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QACvF,IAAI,IAAI,CAAC,IAAI,KAAK,WAAW;YAC3B,KAAK,MAAM,EAAE,IAAI,IAAI,CAAC,mBAAmB;gBAAE,kBAAkB,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;QACxE,IAAI,IAAI,CAAC,IAAI,KAAK,QAAQ;YAAE,KAAK,MAAM,EAAE,IAAI,IAAI,CAAC,gBAAgB;gBAAE,cAAc,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IAC7F,CAAC;IACD,OAAO;QACL,KAAK,EAAE,CAAC,OAAO,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;QAClF,UAAU,EAAE,CAAC,OAAO,CAAC,UAAU,IAAI,EAAE,CAAC,CAAC,MAAM,CAC3C,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC,kBAAkB,CAAC,GAAG,CAAC,SAAS,CAAC,EAAE,C
AAC,CACrD;QACD,aAAa,EAAE,CAAC,OAAO,CAAC,aAAa,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,cAAc,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;KAC9F,CAAC;AACJ,CAAC;AAED,wEAAwE;AAExE,SAAS,eAAe,CAAC,MAAyB;IAChD,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,IAAI,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC;IACtE,MAAM,IAAI,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;IACnE,MAAM,QAAQ,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;IACrF,OAAO;QACL,IAAI;QACJ,GAAG,EAAE,IAAI,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC;QACxB,GAAG,EAAE,IAAI,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC;QACxB,KAAK,EAAE,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC;KAC3B,CAAC;AACJ,CAAC;AAED,wEAAwE;AACxE,MAAM,UAAU,wBAAwB,CACtC,QAAkB,EAClB,aAAqB;IAErB,OAAO,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,EAAE;QACjC,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;YAC/B,QAAQ,CAAC,KAAK,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;YAClC,QAAQ,CAAC,KAAK,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,CAAC;SAClC,CAAC,CAAC;QACH,OAAO,gBAAgB,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,aAAa,CAAC;IAChD,CAAC,CAAC;AACJ,CAAC;AAQD;;;;;;GAMG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB,CACpC,MAAmB,EACnB,KAA8B;IAE9B,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,MAAM,CAAC,KAAK,CAAC,OAAO,IAAI,uBAAuB,CAAC,OAAO,CAAC,CAAC;IACrF,MAAM,aAAa,GAAG,MAAM,CAAC,KAAK,CAAC,aAAa,IAAI,uBAAuB,CAAC,aAAa,CAAC;IAC1F,MAAM,cAAc,GAClB,MAAM,CAAC,KAAK,CAAC,cAAc,IAAI,wBAAwB,CAAC,MAAM,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;IAE1F,MAAM,YAAY,GAAa,EAAE,CAAC;IAClC,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,MAAM,WAAW,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,IAAI,EAAE,MAAM,CAAC,KAAK,CAAC,cAAc,EAAE,CAAC,CAAC;IACvF,KAAK,IAAI,IAAI,GAAG,CAAC,EAAE,IAAI,GAAG,OAAO,EAAE,IAAI,EAAE,EAAE,CAAC;QAC1C,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,KAAK,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC;QAC1D,MAAM,SAAS,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC;QAChE,YAAY
,CAAC,IAAI,CAAC,gBAAgB,CAAC,WAAW,EAAE,SAAS,CAAC,CAAC,CAAC;QAC5D,IAAI,MAAM,cAAc,CAAC,MAAM,CAAC,KAAK,CAAC,cAAc,EAAE,MAAM,CAAC;YAAE,KAAK,EAAE,CAAC;IACzE,CAAC;IACD,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,UAAU,EAAE,eAAe,CAAC,YAAY,CAAC,EAAE,CAAC;AACvE,CAAC;AAED,8DAA8D;AAC9D,MAAM,UAAU,YAAY,CAAC,KAAuB;IAClD,OAAO,KAAK,CAAC,KAAK,GAAG,CAAC,GAAG,KAAK,CAAC,OAAO,CAAC;AACzC,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,UAAU,CACxB,KAAa,EACb,KAAuB,EACvB,cAAuB;IAEvB,IAAI,CAAC,cAAc,EAAE,CAAC;QACpB,OAAO;YACL,OAAO,EAAE,cAAc;YACvB,KAAK,EACH,sFAAsF;gBACtF,2BAA2B,KAAK,0CAA0C;SAC7E,CAAC;IACJ,CAAC;IACD,IAAI,YAAY,CAAC,KAAK,CAAC,EAAE,CAAC;QACxB,OAAO;YACL,OAAO,EAAE,WAAW;YACpB,KAAK,EACH,oBAAoB,KAAK,2BAA2B,KAAK,CAAC,KAAK,IAAI,KAAK,CAAC,OAAO,GAAG;gBACnF,8CAA8C,KAAK,CAAC,UAAU,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG;gBACjF,KAAK,KAAK,CAAC,UAAU,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI;SAC7C,CAAC;IACJ,CAAC;IACD,IAAI,KAAK,CAAC,KAAK,GAAG,CAAC,EAAE,CAAC;QACpB,OAAO;YACL,OAAO,EAAE,cAAc;YACvB,KAAK,EACH,0BAA0B,KAAK,iBAAiB,KAAK,CAAC,KAAK,IAAI,KAAK,CAAC,OAAO,UAAU;gBACtF,qEAAqE;SACxE,CAAC;IACJ,CAAC;IACD,OAAO;QACL,OAAO,EAAE,eAAe;QACxB,KAAK,EACH,2BAA2B,KAAK,kCAAkC,KAAK,CAAC,OAAO,UAAU;YACzF,0DAA0D;KAC7D,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* bisectCulprits — multi-culprit bisection over the ranked suspect set
|
|
3
|
+
* (RFC-003 Part B, block D9). The "git bisect" of the localizer.
|
|
4
|
+
*
|
|
5
|
+
* When single-suspect ablations don't flip the outcome — redundant causes
|
|
6
|
+
* (two facts that EACH justify the wrong answer), or interacting ones —
|
|
7
|
+
* the culprit is a SET. This harness finds a minimal culprit set by
|
|
8
|
+
* recursive halving over the ranked suspects (delta-debugging style,
|
|
9
|
+
* Zeller's ddmin specialized to two-way splits), then keeps searching the
|
|
10
|
+
* remainder for INDEPENDENT culprits until the remainder stops flipping.
|
|
11
|
+
*
|
|
12
|
+
* Probe semantics (the D9 discipline):
|
|
13
|
+
* - every probe = N seeded reruns of the consumer's `AblationRunner`
|
|
14
|
+
* with the probe's combined specs; "flipped" = MAJORITY of runs
|
|
15
|
+
* changed outcome; similarity mean ± spread is always reported —
|
|
16
|
+
* never single-run verdicts;
|
|
17
|
+
* - probe 0 is the BASELINE (no ablation): if it flips, the scenario
|
|
18
|
+
* itself is unstable and the result is honestly `'inconclusive'`;
|
|
19
|
+
* - probes are cached by spec-set, and budgeted (`maxProbes`) — running
|
|
20
|
+
* out of budget yields `'inconclusive'`, never a partial claim
|
|
21
|
+
* dressed up as a finding.
|
|
22
|
+
*
|
|
23
|
+
* §B2: the returned `verdict`/`culprits` are CAUSAL claims — they rest
|
|
24
|
+
* exclusively on counterfactual reruns. The input ranking only chooses
|
|
25
|
+
* the SEARCH ORDER (better ranking = fewer probes), it never decides the
|
|
26
|
+
* outcome.
|
|
27
|
+
*/
|
|
28
|
+
import { probeFlipped, runAblationProbe } from './ablation.js';
|
|
29
|
+
import { CONTEXT_BISECT_DEFAULTS } from './types.js';
|
|
30
|
+
import { suspectLabel } from './localize.js';
|
|
31
|
+
// ─── The harness ─────────────────────────────────────────────────────
|
|
32
|
+
/**
 * Internal sentinel error: thrown by `probe()` when the number of recorded
 * probes has reached `maxProbes`, and caught at the bottom of
 * `bisectCulprits`, which turns it into an `'inconclusive'` verdict.
 * It is not exported from this module.
 */
class ProbeBudgetExceeded extends Error {
    constructor() {
        super('probe budget exceeded');
        // Without an explicit name the instance inherits the generic 'Error'
        // name, which makes it indistinguishable in logs and stack traces.
        this.name = 'ProbeBudgetExceeded';
    }
}
|
|
37
|
+
/**
 * Find minimal culprit set(s) by seeded counterfactual bisection. See
 * module docs for semantics and the §B2 claim tier.
 *
 * Flow:
 *   1. Baseline probe (no ablation). A single flip marks the scenario
 *      unstable and ends the run with `'inconclusive'`.
 *   2. Reproduction gate: ablating ALL candidates must flip the outcome,
 *      otherwise the verdict is `'not-reproducible'`.
 *   3. Up to `maxCulprits` rounds of two-way ddmin over the remaining
 *      candidates; each round yields one culprit set, which is then removed
 *      before the remainder is re-probed.
 *
 * @param {object} options
 * @param {Array<object>} options.suspects - Ranked suspects, likeliest first.
 *   Only suspects that carry an `ablation` whose `kind` is not `'arg'` are
 *   considered; the rest are ignored.
 * @param {*} options.rerun - Forwarded untouched to `runAblationProbe` as
 *   `config.rerun`.
 * @param {*} options.embedder - Forwarded untouched to `runAblationProbe` as
 *   `config.embedder`.
 * @param {number} [options.maxProbes] - Cap on recorded probes (the baseline
 *   probe counts towards it). Defaults to `CONTEXT_BISECT_DEFAULTS.maxProbes`.
 * @param {number} [options.maxCulprits] - Cap on the number of culprit sets
 *   searched for. Defaults to `CONTEXT_BISECT_DEFAULTS.maxCulprits`; values
 *   below 1 (or NaN) are treated as 1 so that a `'confirmed'` verdict always
 *   names at least one culprit set.
 * @returns {Promise<{verdict: string, culprits: Array<Array<object>>, probes: Array<object>, runsUsed: number}>}
 *   `verdict` is `'confirmed'`, `'not-reproducible'` or `'inconclusive'`;
 *   `culprits` is empty unless the verdict is `'confirmed'`; `probes` lists
 *   every probe that was actually run (baseline first); `runsUsed` is the sum
 *   of `stats.samples` over those probes.
 * @throws Re-throws any error other than the internal budget sentinel.
 */
export async function bisectCulprits(options) {
    const candidates = options.suspects.filter((suspect) => suspect.ablation !== undefined && suspect.ablation.kind !== 'arg');
    const maxProbes = options.maxProbes ?? CONTEXT_BISECT_DEFAULTS.maxProbes;
    // A bound below 1 would skip the search loop entirely and still report
    // 'confirmed' with no culprits; clamp so at least one round always runs.
    const requestedCulprits = options.maxCulprits ?? CONTEXT_BISECT_DEFAULTS.maxCulprits;
    const maxCulprits = requestedCulprits >= 1 ? requestedCulprits : 1;
    const config = { rerun: options.rerun, embedder: options.embedder };
    const probes = [];
    const cache = new Map();
    let runsUsed = 0;
    // Order-independent cache key for a suspect set.
    const keyOf = (set) => set
        .map((suspect) => suspectLabel(suspect))
        .sort()
        .join('|');
    /**
     * Run (or fetch from cache) the probe that ablates every suspect in `set`.
     * Returns whether the outcome flipped; throws `ProbeBudgetExceeded` when a
     * new probe would be needed but the budget is already spent.
     */
    async function probe(set) {
        const key = keyOf(set);
        const cached = cache.get(key);
        if (cached !== undefined)
            return cached;
        if (probes.length >= maxProbes)
            throw new ProbeBudgetExceeded();
        const stats = await runAblationProbe(config, set.flatMap((suspect) => (suspect.ablation !== undefined ? [suspect.ablation] : [])));
        runsUsed += stats.samples;
        const flipped = probeFlipped(stats);
        probes.push({ ablated: set.map((suspect) => suspectLabel(suspect)), stats, flipped });
        cache.set(key, flipped);
        return flipped;
    }
    /**
     * Minimal subset of `set` that — together with `context` — flips
     * the outcome. Precondition: probe(set ∪ context) flipped.
     * Two-way ddmin: try each half; on interference (neither half alone
     * suffices) minimize each half with the other as context.
     */
    async function minimize(set, context) {
        if (set.length <= 1)
            return [...set];
        const mid = Math.ceil(set.length / 2);
        const top = set.slice(0, mid); // ranked order: the likelier half first
        const rest = set.slice(mid);
        if (await probe([...top, ...context]))
            return minimize(top, context);
        if (await probe([...rest, ...context]))
            return minimize(rest, context);
        // Interference: parts of BOTH halves are needed jointly.
        const fromTop = await minimize(top, [...rest, ...context]);
        const fromRest = await minimize(rest, [...fromTop, ...context]);
        return [...fromTop, ...fromRest];
    }
    try {
        // Baseline: an unstable scenario invalidates everything downstream.
        // ZERO-TOLERANCE (review Finding 1): a single un-ablated flip marks the
        // scenario unstable — the majority-rule probeFlipped() gate would let a
        // 1-in-3-flaky scenario through to a 'confirmed' CAUSAL verdict, which
        // violates the §B2 honest-claims discipline. Same gate localize.ts uses.
        {
            const baselineStats = await runAblationProbe(config, []);
            runsUsed += baselineStats.samples;
            const unstable = baselineStats.flips > 0;
            probes.push({ ablated: [], stats: baselineStats, flipped: unstable });
            cache.set(keyOf([]), probeFlipped(baselineStats));
            if (unstable) {
                return { verdict: 'inconclusive', culprits: [], probes, runsUsed };
            }
        }
        // Reproduction gate: the full ranked set must flip at all.
        if (candidates.length === 0 || !(await probe(candidates))) {
            return { verdict: 'not-reproducible', culprits: [], probes, runsUsed };
        }
        // Find minimal sets; then keep searching the remainder for
        // INDEPENDENT culprits until it stops flipping.
        const culprits = [];
        let remaining = candidates;
        for (let round = 0; round < maxCulprits; round++) {
            const found = await minimize(remaining, []);
            culprits.push(found);
            const foundKeys = new Set(found.map((suspect) => suspectLabel(suspect)));
            remaining = remaining.filter((suspect) => !foundKeys.has(suspectLabel(suspect)));
            if (remaining.length === 0 || !(await probe(remaining)))
                break;
        }
        return { verdict: 'confirmed', culprits, probes, runsUsed };
    }
    catch (error) {
        // Out of budget: report nothing rather than a partial finding.
        if (error instanceof ProbeBudgetExceeded) {
            return { verdict: 'inconclusive', culprits: [], probes, runsUsed };
        }
        throw error;
    }
}
|
|
129
|
+
//# sourceMappingURL=bisect.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"bisect.js","sourceRoot":"","sources":["../../../../src/lib/context-bisect/bisect.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AAEH,OAAO,EAAE,YAAY,EAAE,gBAAgB,EAAoB,MAAM,eAAe,CAAC;AAEjF,OAAO,EAAE,uBAAuB,EAAE,MAAM,YAAY,CAAC;AACrD,OAAO,EAAE,YAAY,EAAE,MAAM,eAAe,CAAC;AAiD7C,wEAAwE;AAExE,MAAM,mBAAoB,SAAQ,KAAK;IACrC;QACE,KAAK,CAAC,uBAAuB,CAAC,CAAC;IACjC,CAAC;CACF;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,OAA8B;IACjE,MAAM,UAAU,GAAG,OAAO,CAAC,QAAQ,CAAC,MAAM,CACxC,CAAC,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,QAAQ,KAAK,SAAS,IAAI,OAAO,CAAC,QAAQ,CAAC,IAAI,KAAK,KAAK,CAC/E,CAAC;IACF,MAAM,SAAS,GAAG,OAAO,CAAC,SAAS,IAAI,uBAAuB,CAAC,SAAS,CAAC;IACzE,MAAM,WAAW,GAAG,OAAO,CAAC,WAAW,IAAI,uBAAuB,CAAC,WAAW,CAAC;IAC/E,MAAM,MAAM,GAAgB,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,QAAQ,EAAE,OAAO,CAAC,QAAQ,EAAE,CAAC;IAEjF,MAAM,MAAM,GAAqB,EAAE,CAAC;IACpC,MAAM,KAAK,GAAG,IAAI,GAAG,EAAmB,CAAC;IACzC,IAAI,QAAQ,GAAG,CAAC,CAAC;IAEjB,MAAM,KAAK,GAAG,CAAC,GAAuB,EAAU,EAAE,CAChD,GAAG;SACA,GAAG,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC;SACvC,IAAI,EAAE;SACN,IAAI,CAAC,GAAG,CAAC,CAAC;IAEf,KAAK,UAAU,KAAK,CAAC,GAAuB;QAC1C,MAAM,GAAG,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC;QACvB,MAAM,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAC9B,IAAI,MAAM,KAAK,SAAS;YAAE,OAAO,MAAM,CAAC;QACxC,IAAI,MAAM,CAAC,MAAM,IAAI,SAAS;YAAE,MAAM,IAAI,mBAAmB,EAAE,CAAC;QAChE,MAAM,KAAK,GAAG,MAAM,gBAAgB,CAClC,MAAM,EACN,GAAG,CAAC,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,QAAQ,KAAK,SAAS,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CACrF,CAAC;QACF,QAAQ,IAAI,KAAK,CAAC,OAAO,CAAC;QAC1B,MAAM,OAAO,GAAG,YAAY,CAAC,KAAK,CAAC,CAAC;QACpC,MAAM,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,EAAE,KAAK,EAAE,OAAO,EAAE,CAAC,CAAC;QACtF,KAAK,CAAC,GAAG,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;QACxB,OAAO,OAAO,CAAC;IACjB,CAAC;IAED;;;;;OAKG;IACH,KAAK,UAAU,QAAQ,CACrB,GAAuB,EACvB,OAA2B;QAE3B,IAAI,GAAG,CAAC,MAAM,IAAI,CAAC;YAAE,OAAO,CAAC,GAAG,GAAG,CAAC,CAAC;QACrC,MAAM,GAAG,GA
AG,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACtC,MAAM,GAAG,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,wCAAwC;QACvE,MAAM,IAAI,GAAG,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC5B,IAAI,MAAM,KAAK,CAAC,CAAC,GAAG,GAAG,EAAE,GAAG,OAAO,CAAC,CAAC;YAAE,OAAO,QAAQ,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;QACrE,IAAI,MAAM,KAAK,CAAC,CAAC,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC,CAAC;YAAE,OAAO,QAAQ,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;QACvE,yDAAyD;QACzD,MAAM,OAAO,GAAG,MAAM,QAAQ,CAAC,GAAG,EAAE,CAAC,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC,CAAC,CAAC;QAC3D,MAAM,QAAQ,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC,GAAG,OAAO,EAAE,GAAG,OAAO,CAAC,CAAC,CAAC;QAChE,OAAO,CAAC,GAAG,OAAO,EAAE,GAAG,QAAQ,CAAC,CAAC;IACnC,CAAC;IAED,IAAI,CAAC;QACH,oEAAoE;QACpE,wEAAwE;QACxE,wEAAwE;QACxE,uEAAuE;QACvE,yEAAyE;QACzE,CAAC;YACC,MAAM,aAAa,GAAG,MAAM,gBAAgB,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;YACzD,QAAQ,IAAI,aAAa,CAAC,OAAO,CAAC;YAClC,MAAM,QAAQ,GAAG,aAAa,CAAC,KAAK,GAAG,CAAC,CAAC;YACzC,MAAM,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,EAAE,EAAE,KAAK,EAAE,aAAa,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC;YACtE,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,EAAE,YAAY,CAAC,aAAa,CAAC,CAAC,CAAC;YAClD,IAAI,QAAQ,EAAE,CAAC;gBACb,OAAO,EAAE,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,QAAQ,EAAE,CAAC;YACrE,CAAC;QACH,CAAC;QACD,2DAA2D;QAC3D,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,UAAU,CAAC,CAAC,EAAE,CAAC;YAC1D,OAAO,EAAE,OAAO,EAAE,kBAAkB,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,QAAQ,EAAE,CAAC;QACzE,CAAC;QAED,2DAA2D;QAC3D,gDAAgD;QAChD,MAAM,QAAQ,GAAgB,EAAE,CAAC;QACjC,IAAI,SAAS,GAAG,UAAU,CAAC;QAC3B,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,GAAG,WAAW,EAAE,KAAK,EAAE,EAAE,CAAC;YACjD,MAAM,KAAK,GAAG,MAAM,QAAQ,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;YAC5C,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACrB,MAAM,SAAS,GAAG,IAAI,GAAG,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;YACzE,SAAS,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC,SAAS,CAAC,GAAG,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;YACjF,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,SAAS,CAAC,CAAC
;gBAAE,MAAM;QACjE,CAAC;QACD,OAAO,EAAE,OAAO,EAAE,WAAW,EAAE,QAAQ,EAAE,MAAM,EAAE,QAAQ,EAAE,CAAC;IAC9D,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,mBAAmB,EAAE,CAAC;YACzC,OAAO,EAAE,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,QAAQ,EAAE,CAAC;QACrE,CAAC;QACD,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* context-bisect — RFC-003 Part B: the contextual-bug LOCALIZER,
|
|
3
|
+
* "git bisect for context".
|
|
4
|
+
*
|
|
5
|
+
* Assembly over shipped pieces: footprintjs 9.8.0's complete causal DAG
|
|
6
|
+
* (control edges, honesty markers, `EdgeWeigher` hook) × influence-core
|
|
7
|
+
* scoring (D6) × consumer-run counterfactual ablation.
|
|
8
|
+
*
|
|
9
|
+
* D7 — `llmEdgeWeigher` influence-weighted LLM-call slice edges
|
|
10
|
+
* D8 — `localizeContextBug` trigger → slice → ranked suspects → ablation
|
|
11
|
+
* D9 — `bisectCulprits` seeded multi-culprit bisection + variance
|
|
12
|
+
*
|
|
13
|
+
* §B2 claim tiers (spelled out on every type): weights/scores are
|
|
14
|
+
* embedding-geometry PROXIES; ablation verdicts are the ONLY causal
|
|
15
|
+
* claims; slice completeness is bounded by tracking — and says so.
|
|
16
|
+
*/
|
|
17
|
+
export { llmEdgeWeigher, stepOutputText, } from './llmEdgeWeigher.js';
|
|
18
|
+
export { defaultSuspectClassifier, formatContextBugReport, llmCallIdsFromEvents, localizeContextBug, suspectLabel, } from './localize.js';
|
|
19
|
+
export { ablationForSuspect, applyAblations, defaultOutcomeComparator, probeFlipped, runAblationProbe, verdictFor, } from './ablation.js';
|
|
20
|
+
export { bisectCulprits, } from './bisect.js';
|
|
21
|
+
export { CONTEXT_BISECT_DEFAULTS, } from './types.js';
|
|
22
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../src/lib/context-bisect/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,OAAO,EACL,cAAc,EACd,cAAc,GAIf,MAAM,qBAAqB,CAAC;AAE7B,OAAO,EACL,wBAAwB,EACxB,sBAAsB,EACtB,oBAAoB,EACpB,kBAAkB,EAClB,YAAY,GAKb,MAAM,eAAe,CAAC;AAEvB,OAAO,EACL,kBAAkB,EAClB,cAAc,EACd,wBAAwB,EACxB,YAAY,EACZ,gBAAgB,EAChB,UAAU,GAGX,MAAM,eAAe,CAAC;AAEvB,OAAO,EACL,cAAc,GAIf,MAAM,aAAa,CAAC;AAErB,OAAO,EACL,uBAAuB,GAoBxB,MAAM,YAAY,CAAC"}
|
|
Binary file
|