agentfootprint 6.24.0 → 6.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -0
- package/bin/agentfootprint-lint-tools.mjs +14 -0
- package/dist/esm/lib/context-bisect/ablation.js +183 -0
- package/dist/esm/lib/context-bisect/ablation.js.map +1 -0
- package/dist/esm/lib/context-bisect/bisect.js +129 -0
- package/dist/esm/lib/context-bisect/bisect.js.map +1 -0
- package/dist/esm/lib/context-bisect/index.js +22 -0
- package/dist/esm/lib/context-bisect/index.js.map +1 -0
- package/dist/esm/lib/context-bisect/llmEdgeWeigher.js +0 -0
- package/dist/esm/lib/context-bisect/llmEdgeWeigher.js.map +1 -0
- package/dist/esm/lib/context-bisect/localize.js +555 -0
- package/dist/esm/lib/context-bisect/localize.js.map +1 -0
- package/dist/esm/lib/context-bisect/types.js +56 -0
- package/dist/esm/lib/context-bisect/types.js.map +1 -0
- package/dist/esm/lib/tool-lint/analyze.js +235 -0
- package/dist/esm/lib/tool-lint/analyze.js.map +1 -0
- package/dist/esm/lib/tool-lint/cli.js +198 -0
- package/dist/esm/lib/tool-lint/cli.js.map +1 -0
- package/dist/esm/lib/tool-lint/format.js +61 -0
- package/dist/esm/lib/tool-lint/format.js.map +1 -0
- package/dist/esm/lib/tool-lint/index.js +23 -0
- package/dist/esm/lib/tool-lint/index.js.map +1 -0
- package/dist/esm/lib/tool-lint/rules.js +249 -0
- package/dist/esm/lib/tool-lint/rules.js.map +1 -0
- package/dist/esm/lib/tool-lint/types.js +25 -0
- package/dist/esm/lib/tool-lint/types.js.map +1 -0
- package/dist/esm/observe.js +20 -0
- package/dist/esm/observe.js.map +1 -1
- package/dist/esm/recorders/observability/ToolChoiceRecorder.js +261 -0
- package/dist/esm/recorders/observability/ToolChoiceRecorder.js.map +1 -0
- package/dist/lib/context-bisect/ablation.js +192 -0
- package/dist/lib/context-bisect/ablation.js.map +1 -0
- package/dist/lib/context-bisect/bisect.js +133 -0
- package/dist/lib/context-bisect/bisect.js.map +1 -0
- package/dist/lib/context-bisect/index.js +40 -0
- package/dist/lib/context-bisect/index.js.map +1 -0
- package/dist/lib/context-bisect/llmEdgeWeigher.js +0 -0
- package/dist/lib/context-bisect/llmEdgeWeigher.js.map +1 -0
- package/dist/lib/context-bisect/localize.js +563 -0
- package/dist/lib/context-bisect/localize.js.map +1 -0
- package/dist/lib/context-bisect/types.js +59 -0
- package/dist/lib/context-bisect/types.js.map +1 -0
- package/dist/lib/tool-lint/analyze.js +242 -0
- package/dist/lib/tool-lint/analyze.js.map +1 -0
- package/dist/lib/tool-lint/cli.js +203 -0
- package/dist/lib/tool-lint/cli.js.map +1 -0
- package/dist/lib/tool-lint/format.js +65 -0
- package/dist/lib/tool-lint/format.js.map +1 -0
- package/dist/lib/tool-lint/index.js +43 -0
- package/dist/lib/tool-lint/index.js.map +1 -0
- package/dist/lib/tool-lint/rules.js +256 -0
- package/dist/lib/tool-lint/rules.js.map +1 -0
- package/dist/lib/tool-lint/types.js +26 -0
- package/dist/lib/tool-lint/types.js.map +1 -0
- package/dist/observe.js +56 -1
- package/dist/observe.js.map +1 -1
- package/dist/recorders/observability/ToolChoiceRecorder.js +266 -0
- package/dist/recorders/observability/ToolChoiceRecorder.js.map +1 -0
- package/dist/types/lib/context-bisect/ablation.d.ts +97 -0
- package/dist/types/lib/context-bisect/ablation.d.ts.map +1 -0
- package/dist/types/lib/context-bisect/bisect.d.ts +76 -0
- package/dist/types/lib/context-bisect/bisect.d.ts.map +1 -0
- package/dist/types/lib/context-bisect/index.d.ts +22 -0
- package/dist/types/lib/context-bisect/index.d.ts.map +1 -0
- package/dist/types/lib/context-bisect/llmEdgeWeigher.d.ts +125 -0
- package/dist/types/lib/context-bisect/llmEdgeWeigher.d.ts.map +1 -0
- package/dist/types/lib/context-bisect/localize.d.ts +119 -0
- package/dist/types/lib/context-bisect/localize.d.ts.map +1 -0
- package/dist/types/lib/context-bisect/types.d.ts +356 -0
- package/dist/types/lib/context-bisect/types.d.ts.map +1 -0
- package/dist/types/lib/tool-lint/analyze.d.ts +84 -0
- package/dist/types/lib/tool-lint/analyze.d.ts.map +1 -0
- package/dist/types/lib/tool-lint/cli.d.ts +44 -0
- package/dist/types/lib/tool-lint/cli.d.ts.map +1 -0
- package/dist/types/lib/tool-lint/format.d.ts +19 -0
- package/dist/types/lib/tool-lint/format.d.ts.map +1 -0
- package/dist/types/lib/tool-lint/index.d.ts +24 -0
- package/dist/types/lib/tool-lint/index.d.ts.map +1 -0
- package/dist/types/lib/tool-lint/rules.d.ts +86 -0
- package/dist/types/lib/tool-lint/rules.d.ts.map +1 -0
- package/dist/types/lib/tool-lint/types.d.ts +156 -0
- package/dist/types/lib/tool-lint/types.d.ts.map +1 -0
- package/dist/types/observe.d.ts +3 -0
- package/dist/types/observe.d.ts.map +1 -1
- package/dist/types/recorders/observability/ToolChoiceRecorder.d.ts +165 -0
- package/dist/types/recorders/observability/ToolChoiceRecorder.d.ts.map +1 -0
- package/package.json +4 -2
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* context-bisect types — RFC-003 Part B: the contextual-bug localizer
|
|
3
|
+
* ("git bisect for context").
|
|
4
|
+
*
|
|
5
|
+
* Pattern: assembly contract. Part B is pure ASSEMBLY over shipped pieces:
|
|
6
|
+
* footprintjs 9.8.0's complete causal DAG (control edges, honesty
|
|
7
|
+
* markers, `EdgeWeigher` hook) × influence-core scoring (D6) ×
|
|
8
|
+
* consumer-run counterfactual ablation. No new engine features,
|
|
9
|
+
* no new typed events.
|
|
10
|
+
* Role: `src/lib/context-bisect/` leaf. Exported via
|
|
11
|
+
* `agentfootprint/observe`.
|
|
12
|
+
*
|
|
13
|
+
* ## The two-tier honest-claims discipline (RFC-003 §B2)
|
|
14
|
+
*
|
|
15
|
+
* Every number in these types belongs to exactly ONE of two tiers, and the
|
|
16
|
+
* docs say which:
|
|
17
|
+
*
|
|
18
|
+
* - **CORRELATIONAL** — edge weights, suspect scores, rankings. These are
|
|
19
|
+
* deterministic embedding-geometry PROXIES (influence-core composite:
|
|
20
|
+
* semantic alignment between what a source wrote and what the LLM step
|
|
21
|
+
* produced). They mean "high semantic alignment", never "the model
|
|
22
|
+
* answered BECAUSE of this". A report without reruns stops here and is
|
|
23
|
+
* marked `mode: 'correlational'`.
|
|
24
|
+
*
|
|
25
|
+
* - **CAUSAL** — ablation verdicts ONLY. A suspect earns `verdict:
|
|
26
|
+
* 'confirmed'` exclusively by counterfactual evidence: the consumer's
|
|
27
|
+
* `AblationRunner` re-ran the scenario WITHOUT the suspect N seeded
|
|
28
|
+
* times and the outcome flipped (with baseline stability checked and
|
|
29
|
+
* variance reported — never a single-run verdict).
|
|
30
|
+
*
|
|
31
|
+
* Slice completeness is bounded by tracking — and SAYS so: untracked reads
|
|
32
|
+
* (`$getArgs()` / env / silent reads), missing control-dependence lookups,
|
|
33
|
+
* missing read tracking, and depth/node truncation all surface as
|
|
34
|
+
* `honestyFlags` on the report, mirrored from footprintjs's own A2/A4
|
|
35
|
+
* markers.
|
|
36
|
+
*/
|
|
37
|
+
import type { CommitBundle, RuntimeSnapshot, StageSnapshot } from 'footprintjs/advanced';
|
|
38
|
+
import type { ControlDepLookup } from 'footprintjs/trace';
|
|
39
|
+
import type { Embedder, InfluenceWeights } from '../influence-core/index.js';
|
|
40
|
+
/**
|
|
41
|
+
* Minimal structural envelope for captured typed events — satisfied by
|
|
42
|
+
* `AgentfootprintEvent` (collect with `agent.on('*', e => events.push(e))`).
|
|
43
|
+
* Structural so a consumer can hand in any array shaped like this.
|
|
44
|
+
*/
|
|
45
|
+
export interface CapturedEventLike {
|
|
46
|
+
readonly type: string;
|
|
47
|
+
readonly payload: unknown;
|
|
48
|
+
readonly meta: {
|
|
49
|
+
readonly runtimeStageId: string;
|
|
50
|
+
};
|
|
51
|
+
}
|
|
52
|
+
/**
|
|
53
|
+
* Minimal per-step quality lookup for trigger derivation — satisfied by
|
|
54
|
+
* footprintjs's `QualityRecorder` (structural, decoupled).
|
|
55
|
+
*/
|
|
56
|
+
export interface QualityTriggerLookup {
|
|
57
|
+
getLowest(): {
|
|
58
|
+
runtimeStageId: string;
|
|
59
|
+
entry: {
|
|
60
|
+
score: number;
|
|
61
|
+
stageName: string;
|
|
62
|
+
};
|
|
63
|
+
} | undefined;
|
|
64
|
+
}
|
|
65
|
+
/**
|
|
66
|
+
* The frozen evidence of one completed run — a structural SUPERSET of the
|
|
67
|
+
* trace-toolpack's `TraceToolpackArtifacts`, so the same bag literal can
|
|
68
|
+
* serve `traceToolpack(...)` and `localizeContextBug(...)`: every
|
|
69
|
+
* runtimeStageId in the report drills straight into the toolpack tools.
|
|
70
|
+
*
|
|
71
|
+
* - `snapshot` — `executor.getSnapshot()` / `agent.getLastSnapshot()`.
|
|
72
|
+
* - `controlDeps` — OPTIONAL `controlDepRecorder().asLookup()` from the
|
|
73
|
+
* run. With it, the slice includes `[control: <rule label>]` edges to
|
|
74
|
+
* the deciders that routed execution. Without it, the report carries the
|
|
75
|
+
* `no-control-deps` honesty flag.
|
|
76
|
+
* - `quality` — OPTIONAL `QualityRecorder` from the run; its
|
|
77
|
+
* lowest-scoring step is the default trigger when `atStep` is absent.
|
|
78
|
+
* - `events` — OPTIONAL captured typed events; used to extract LLM-call
|
|
79
|
+
* step ids (`stream.llm_start`) when `llmCallIds` is not given.
|
|
80
|
+
* - `llmCallIds` — explicit override: the runtimeStageIds of LLM-call
|
|
81
|
+
* executions (the steps whose parent edges D7 weighs).
|
|
82
|
+
*/
|
|
83
|
+
export interface ContextBugArtifacts {
|
|
84
|
+
readonly snapshot: RuntimeSnapshot;
|
|
85
|
+
readonly controlDeps?: ControlDepLookup;
|
|
86
|
+
readonly quality?: QualityTriggerLookup;
|
|
87
|
+
readonly events?: readonly CapturedEventLike[];
|
|
88
|
+
readonly llmCallIds?: readonly string[];
|
|
89
|
+
}
|
|
90
|
+
/**
|
|
91
|
+
* What kind of context source a suspect is — picks which ablation adapter
|
|
92
|
+
* applies. `'stage'` is the honest fallback for slice nodes the classifier
|
|
93
|
+
* cannot map to an ablatable source (pipeline plumbing, plain stages).
|
|
94
|
+
*/
|
|
95
|
+
export type SuspectKind = 'tool' | 'injection' | 'memory' | 'arg' | 'stage';
|
|
96
|
+
/** Kind-specific identity + the text the semantic refinement embedded. */
|
|
97
|
+
export interface SuspectDetail {
|
|
98
|
+
/** Tool name (kind 'tool'). */
|
|
99
|
+
readonly toolName?: string;
|
|
100
|
+
/** Injection id (kind 'injection' / 'memory') — `Injection.id`. */
|
|
101
|
+
readonly injectionId?: string;
|
|
102
|
+
/** Injection flavor (fact / skill / rag / memory / …), when known. */
|
|
103
|
+
readonly flavor?: string;
|
|
104
|
+
/**
|
|
105
|
+
* The suspect's own content text (tool result / injection rawContent),
|
|
106
|
+
* already redaction-scrubbed by footprintjs at commit time. This is
|
|
107
|
+
* what the embedder saw for `semanticScore`.
|
|
108
|
+
*/
|
|
109
|
+
readonly text?: string;
|
|
110
|
+
}
|
|
111
|
+
/**
|
|
112
|
+
* One hop of a suspect's evidence path — the argmax-weight chain from the
|
|
113
|
+
* trigger step back to the suspect. Control edges carry `kind: 'control'`
|
|
114
|
+
* and (when decide() supplied one) the rule label in `key`.
|
|
115
|
+
*/
|
|
116
|
+
export interface EdgePathStep {
|
|
117
|
+
/** Child (downstream) step. The first hop's `from` is the trigger. */
|
|
118
|
+
readonly from: string;
|
|
119
|
+
readonly fromName: string;
|
|
120
|
+
/** Parent (upstream) step. The last hop's `to` is the suspect. */
|
|
121
|
+
readonly to: string;
|
|
122
|
+
readonly toName: string;
|
|
123
|
+
readonly kind: 'data' | 'control';
|
|
124
|
+
/** State key (data) or decide() rule label (control), when present. */
|
|
125
|
+
readonly key?: string;
|
|
126
|
+
/** Edge weight — 1.0 unless D7 weighed it (child was an LLM call). */
|
|
127
|
+
readonly weight: number;
|
|
128
|
+
}
|
|
129
|
+
/** Per-run similarity statistics — variance ALWAYS reported (D9). */
|
|
130
|
+
export interface SimilarityStats {
|
|
131
|
+
readonly mean: number;
|
|
132
|
+
readonly min: number;
|
|
133
|
+
readonly max: number;
|
|
134
|
+
/** Population standard deviation across the N seeded reruns. */
|
|
135
|
+
readonly stdev: number;
|
|
136
|
+
}
|
|
137
|
+
/** Evidence from N seeded ablation reruns of one probe. */
|
|
138
|
+
export interface AblationRunStats {
|
|
139
|
+
/** Seeded reruns performed (the consumer's runner was called N times). */
|
|
140
|
+
readonly samples: number;
|
|
141
|
+
/** Runs where `outcomeChanged(original, ablated)` was true. */
|
|
142
|
+
readonly flips: number;
|
|
143
|
+
/** Embedding similarity of each ablated output to the original. */
|
|
144
|
+
readonly similarity: SimilarityStats;
|
|
145
|
+
}
|
|
146
|
+
export type AblationVerdictKind = 'confirmed' | 'not-confirmed' | 'inconclusive';
|
|
147
|
+
/**
|
|
148
|
+
* The ONLY causal claim in the report (§B2). `'confirmed'` = ablating the
|
|
149
|
+
* suspect flipped the outcome in a MAJORITY of N seeded reruns while the
|
|
150
|
+
* un-ablated baseline stayed stable. `'inconclusive'` = mixed flips, or an
|
|
151
|
+
* unstable baseline (the scenario itself doesn't reproduce — no ablation
|
|
152
|
+
* verdict is trustworthy then). `'not-confirmed'` = no flip observed; the
|
|
153
|
+
* suspect's score remains a correlational proxy only.
|
|
154
|
+
*/
|
|
155
|
+
export interface AblationVerdict {
|
|
156
|
+
readonly verdict: AblationVerdictKind;
|
|
157
|
+
/** Human-readable claim, phrased at the right tier (causal vs proxy). */
|
|
158
|
+
readonly claim: string;
|
|
159
|
+
}
|
|
160
|
+
/** One ranked suspect. */
|
|
161
|
+
export interface Suspect {
|
|
162
|
+
/**
|
|
163
|
+
* runtimeStageId of the slice node this suspect lives at — drillable
|
|
164
|
+
* with the trace-toolpack tools (`trace_node(source)` etc.).
|
|
165
|
+
*/
|
|
166
|
+
readonly source: string;
|
|
167
|
+
readonly stageName: string;
|
|
168
|
+
readonly kind: SuspectKind;
|
|
169
|
+
readonly detail?: SuspectDetail;
|
|
170
|
+
/**
|
|
171
|
+
* The ranking key — CORRELATIONAL proxy (§B2):
|
|
172
|
+
* `structuralScore × semanticScore` when a semantic refinement exists,
|
|
173
|
+
* else `structuralScore`. Means "semantically aligned and causally
|
|
174
|
+
* UPSTREAM", never "caused".
|
|
175
|
+
*
|
|
176
|
+
* Comparison caveat: a suspect WITHOUT content text (kind 'stage'/'arg',
|
|
177
|
+
* or a path through control edges only) keeps its bare structural score
|
|
178
|
+
* — an UPPER BOUND with no content evidence behind it. Plumbing can
|
|
179
|
+
* legitimately rank above ablatable sources; the ablation verdicts (and
|
|
180
|
+
* `semanticScore`'s presence) are what disambiguate.
|
|
181
|
+
*/
|
|
182
|
+
readonly score: number;
|
|
183
|
+
/**
|
|
184
|
+
* Max-product of edge weights along the best path from the trigger to
|
|
185
|
+
* this node (1.0 when no D7-weighted LLM edge is on the path).
|
|
186
|
+
*/
|
|
187
|
+
readonly structuralScore: number;
|
|
188
|
+
/**
|
|
189
|
+
* Influence-core composite of the suspect's own content vs the trigger
|
|
190
|
+
* step's output (clamped to [0, 1]); only for suspects with a known
|
|
191
|
+
* content text. The same proxy disclaimers as D6 apply.
|
|
192
|
+
*/
|
|
193
|
+
readonly semanticScore?: number;
|
|
194
|
+
/**
|
|
195
|
+
* TRUE when `score` includes a content signal (semanticScore present).
|
|
196
|
+
* FALSE = path-only structural score — an UPPER BOUND that can reach 1.0
|
|
197
|
+
* through control-edge paths alone; rank such suspects with care and
|
|
198
|
+
* prefer ablation verdicts to disambiguate. (Machine-readable twin of
|
|
199
|
+
* the "path only — no content signal" report marking.)
|
|
200
|
+
*/
|
|
201
|
+
readonly hasContentEvidence: boolean;
|
|
202
|
+
/** Evidence path, trigger → … → suspect, control edges labeled. */
|
|
203
|
+
readonly edgePath: readonly EdgePathStep[];
|
|
204
|
+
/** The counterfactual to run — absent for kind 'stage'. */
|
|
205
|
+
readonly ablation?: AblationSpec;
|
|
206
|
+
/** CAUSAL tier — present only when an `AblationRunner` was supplied. */
|
|
207
|
+
readonly verdict?: AblationVerdict;
|
|
208
|
+
/** The rerun evidence behind `verdict`. */
|
|
209
|
+
readonly runs?: AblationRunStats;
|
|
210
|
+
}
|
|
211
|
+
/**
|
|
212
|
+
* What to remove for one counterfactual rerun. The library BUILDS specs
|
|
213
|
+
* (one per suspect kind) and provides `applyAblations` to apply them to
|
|
214
|
+
* the inputs an agent is constructed from; the consumer's `AblationRunner`
|
|
215
|
+
* re-runs the scenario with the spec applied.
|
|
216
|
+
*
|
|
217
|
+
* THE TOOL SEAM (documented because `AgentOptions` has no `ignoredTools`):
|
|
218
|
+
* agentfootprint has no runtime tool kill-switch — tools enter an agent at
|
|
219
|
+
* construction (`.tools([...])` / injection `inject.tools`). Tool ablation
|
|
220
|
+
* therefore happens where tools are DECLARED: the runner rebuilds the
|
|
221
|
+
* agent with `applyAblations(specs, { tools }).tools`. The same pattern
|
|
222
|
+
* covers injections (rebuild without the excluded `Injection.id`s) and
|
|
223
|
+
* memory (filter `MemoryEntry`s by id before attaching).
|
|
224
|
+
*/
|
|
225
|
+
export type AblationSpec =
|
|
226
|
+
/** Drop these tools from the catalog the agent is built with. */
|
|
227
|
+
{
|
|
228
|
+
readonly kind: 'tool';
|
|
229
|
+
readonly ignoredTools: readonly string[];
|
|
230
|
+
}
|
|
231
|
+
/** Drop these injections (facts / skills / instructions / steering / rag). */
|
|
232
|
+
| {
|
|
233
|
+
readonly kind: 'injection';
|
|
234
|
+
readonly excludeInjectionIds: readonly string[];
|
|
235
|
+
}
|
|
236
|
+
/** Drop these memory entries (matched by `MemoryEntry.id`). */
|
|
237
|
+
| {
|
|
238
|
+
readonly kind: 'memory';
|
|
239
|
+
readonly excludeMemoryIds: readonly string[];
|
|
240
|
+
}
|
|
241
|
+
/**
|
|
242
|
+
* The suspect is run INPUT (`$getArgs()` / seed args) — there is nothing
|
|
243
|
+
* the library can filter. The consumer's runner must override the arg
|
|
244
|
+
* itself (e.g. re-run with a neutralized field). `source` names the step
|
|
245
|
+
* that consumed the untracked input.
|
|
246
|
+
*/
|
|
247
|
+
| {
|
|
248
|
+
readonly kind: 'arg';
|
|
249
|
+
readonly source: string;
|
|
250
|
+
readonly note: string;
|
|
251
|
+
};
|
|
252
|
+
/**
|
|
253
|
+
* Consumer-supplied counterfactual runner: re-run the SAME scenario with
|
|
254
|
+
* every spec in `specs` applied, and return the run's output text.
|
|
255
|
+
*
|
|
256
|
+
* Contract:
|
|
257
|
+
* - `specs` may be empty — that is the BASELINE probe (re-run unchanged);
|
|
258
|
+
* its outputs measure the scenario's natural variance.
|
|
259
|
+
* - `run.seed` varies 0..N-1 across the N samples of one probe. Thread it
|
|
260
|
+
* into any stochastic knob (sampling temperature seed, mock script
|
|
261
|
+
* selection) so reruns are deterministic-but-distinct; ignore it for
|
|
262
|
+
* fully deterministic scenarios.
|
|
263
|
+
* - Build a FRESH agent/provider per call — scripted mock providers are
|
|
264
|
+
* stateful (replies consume in order).
|
|
265
|
+
*/
|
|
266
|
+
export type AblationRunner = (specs: readonly AblationSpec[], run: {
|
|
267
|
+
readonly seed: number;
|
|
268
|
+
}) => Promise<string>;
|
|
269
|
+
/**
|
|
270
|
+
* Did the ablated output mean something DIFFERENT from the original?
|
|
271
|
+
* Default: embedding similarity below `flipThreshold`. Override with a
|
|
272
|
+
* domain comparator (e.g. compare extracted decisions) — recommended with
|
|
273
|
+
* `mockEmbedder`, whose cosine compresses prose into ~0.85–0.97 (the C1
|
|
274
|
+
* calibration note: absolute thresholds only with real embedders).
|
|
275
|
+
*/
|
|
276
|
+
export type OutcomeComparator = (original: string, ablated: string) => boolean | Promise<boolean>;
|
|
277
|
+
/** The rerun configuration that upgrades the report to the causal tier. */
|
|
278
|
+
export interface AblationRerun {
|
|
279
|
+
readonly runner: AblationRunner;
|
|
280
|
+
/** The original (buggy) output the reruns are compared against. */
|
|
281
|
+
readonly originalOutput: string;
|
|
282
|
+
/** Seeded reruns per probe. Default 3. Never below 2 (no single-run verdicts — D9). */
|
|
283
|
+
readonly samples?: number;
|
|
284
|
+
/** Outcome-flip comparator. Default: similarity < `flipThreshold`. */
|
|
285
|
+
readonly outcomeChanged?: OutcomeComparator;
|
|
286
|
+
/** Similarity floor for the DEFAULT comparator. Default 0.8. */
|
|
287
|
+
readonly flipThreshold?: number;
|
|
288
|
+
/** Ablate only the top-K ranked suspects that carry a spec. Default 5. */
|
|
289
|
+
readonly maxSuspects?: number;
|
|
290
|
+
}
|
|
291
|
+
/** Slice-shape numbers — how much evidence the ranking stands on. */
|
|
292
|
+
export interface SliceStats {
|
|
293
|
+
readonly nodes: number;
|
|
294
|
+
readonly dataEdges: number;
|
|
295
|
+
readonly controlEdges: number;
|
|
296
|
+
/** Edges that received a D7 (LLM-influence) weight. */
|
|
297
|
+
readonly weightedEdges: number;
|
|
298
|
+
/** Nodes that ALSO consumed untracked sources (args/env/silent reads). */
|
|
299
|
+
readonly incompleteNodes: number;
|
|
300
|
+
readonly maxDepth: number;
|
|
301
|
+
readonly maxNodes: number;
|
|
302
|
+
/** Present when a limit actually cut the slice (footprintjs A4). */
|
|
303
|
+
readonly truncated?: {
|
|
304
|
+
readonly byDepth: boolean;
|
|
305
|
+
readonly byNodes: boolean;
|
|
306
|
+
};
|
|
307
|
+
}
|
|
308
|
+
export type HonestyFlagKind = 'slice-truncated' | 'untracked-sources' | 'no-control-deps' | 'no-read-tracking' | 'no-llm-call-ids' | 'baseline-unstable';
|
|
309
|
+
export interface HonestyFlag {
|
|
310
|
+
readonly flag: HonestyFlagKind;
|
|
311
|
+
readonly note: string;
|
|
312
|
+
}
|
|
313
|
+
/** The localizer's full output (D8). */
|
|
314
|
+
export interface ContextBugReport {
|
|
315
|
+
/** The trigger step the slice was rooted at. */
|
|
316
|
+
readonly step: string;
|
|
317
|
+
readonly stepName: string;
|
|
318
|
+
/** Where the trigger came from. */
|
|
319
|
+
readonly triggerSource: 'explicit' | 'quality' | 'custom';
|
|
320
|
+
/** The quality score that selected the trigger (quality source only). */
|
|
321
|
+
readonly triggerScore?: number;
|
|
322
|
+
/**
|
|
323
|
+
* `'correlational'` — no `AblationRunner` supplied: the report STOPS at
|
|
324
|
+
* the ranking; every score is a proxy and no causal claim is made.
|
|
325
|
+
* `'causal'` — suspects additionally carry ablation verdicts (§B2: the
|
|
326
|
+
* verdicts are the only causal claims; the scores stay proxies).
|
|
327
|
+
*/
|
|
328
|
+
readonly mode: 'correlational' | 'causal';
|
|
329
|
+
/** Ranked suspects, best (most aligned + upstream) first. */
|
|
330
|
+
readonly suspects: readonly Suspect[];
|
|
331
|
+
readonly sliceStats: SliceStats;
|
|
332
|
+
/** ⚠ everything that bounds what this report can honestly claim. */
|
|
333
|
+
readonly honestyFlags: readonly HonestyFlag[];
|
|
334
|
+
/** Baseline probe stats (causal mode only). */
|
|
335
|
+
readonly baseline?: AblationRunStats;
|
|
336
|
+
}
|
|
337
|
+
export declare const CONTEXT_BISECT_DEFAULTS: {
|
|
338
|
+
/** Slice depth budget (forwarded to `causalChain`). */
|
|
339
|
+
readonly maxDepth: 12;
|
|
340
|
+
/** Slice node budget (forwarded to `causalChain`). */
|
|
341
|
+
readonly maxNodes: 80;
|
|
342
|
+
/** Ranked suspects kept on the report. */
|
|
343
|
+
readonly maxSuspects: 12;
|
|
344
|
+
/** Chars of written content embedded per step text (D7). */
|
|
345
|
+
readonly maxTextChars: 2000;
|
|
346
|
+
/** Seeded reruns per ablation probe (D9 — never single-run verdicts). */
|
|
347
|
+
readonly samples: 3;
|
|
348
|
+
/** Default similarity floor for the default outcome comparator. */
|
|
349
|
+
readonly flipThreshold: 0.8;
|
|
350
|
+
/** Ablation probes budget for `bisectCulprits`. */
|
|
351
|
+
readonly maxProbes: 24;
|
|
352
|
+
/** Independent-culprit search rounds for `bisectCulprits`. */
|
|
353
|
+
readonly maxCulprits: 4;
|
|
354
|
+
};
|
|
355
|
+
export type { CommitBundle, ControlDepLookup, Embedder, InfluenceWeights, RuntimeSnapshot, StageSnapshot, };
|
|
356
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../../../src/lib/context-bisect/types.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAmCG;AAEH,OAAO,KAAK,EAAE,YAAY,EAAE,eAAe,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AACzF,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,mBAAmB,CAAC;AAC1D,OAAO,KAAK,EAAE,QAAQ,EAAE,gBAAgB,EAAE,MAAM,4BAA4B,CAAC;AAI7E;;;;GAIG;AACH,MAAM,WAAW,iBAAiB;IAChC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,OAAO,EAAE,OAAO,CAAC;IAC1B,QAAQ,CAAC,IAAI,EAAE;QAAE,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAA;KAAE,CAAC;CACpD;AAED;;;GAGG;AACH,MAAM,WAAW,oBAAoB;IACnC,SAAS,IAAI;QAAE,cAAc,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE;YAAE,KAAK,EAAE,MAAM,CAAC;YAAC,SAAS,EAAE,MAAM,CAAA;SAAE,CAAA;KAAE,GAAG,SAAS,CAAC;CAClG;AAED;;;;;;;;;;;;;;;;;GAiBG;AACH,MAAM,WAAW,mBAAmB;IAClC,QAAQ,CAAC,QAAQ,EAAE,eAAe,CAAC;IACnC,QAAQ,CAAC,WAAW,CAAC,EAAE,gBAAgB,CAAC;IACxC,QAAQ,CAAC,OAAO,CAAC,EAAE,oBAAoB,CAAC;IACxC,QAAQ,CAAC,MAAM,CAAC,EAAE,SAAS,iBAAiB,EAAE,CAAC;IAC/C,QAAQ,CAAC,UAAU,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;CACzC;AAID;;;;GAIG;AACH,MAAM,MAAM,WAAW,GAAG,MAAM,GAAG,WAAW,GAAG,QAAQ,GAAG,KAAK,GAAG,OAAO,CAAC;AAE5E,0EAA0E;AAC1E,MAAM,WAAW,aAAa;IAC5B,+BAA+B;IAC/B,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAC3B,mEAAmE;IACnE,QAAQ,CAAC,WAAW,CAAC,EAAE,MAAM,CAAC;IAC9B,sEAAsE;IACtE,QAAQ,CAAC,MAAM,CAAC,EAAE,MAAM,CAAC;IACzB;;;;OAIG;IACH,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,CAAC;CACxB;AAED;;;;GAIG;AACH,MAAM,WAAW,YAAY;IAC3B,sEAAsE;IACtE,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,kEAAkE;IAClE,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,SAAS,CAAC;IAClC,uEAAuE;IACvE,QAAQ,CAAC,GAAG,CAAC,EAAE,MAAM,CAAC;IACtB,sEAAsE;IACtE,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;CACzB;AAED,qEAAqE;AACrE,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,gEAAgE;IAChE,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;CACxB;AAED,2DAA2D;AAC3D,MAAM,WAAW,gBAAgB;IAC/B,0EAA0E;IAC1E,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,+DAA+D;IAC/D,QAAQ,
CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,mEAAmE;IACnE,QAAQ,CAAC,UAAU,EAAE,eAAe,CAAC;CACtC;AAED,MAAM,MAAM,mBAAmB,GAAG,WAAW,GAAG,eAAe,GAAG,cAAc,CAAC;AAEjF;;;;;;;GAOG;AACH,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,OAAO,EAAE,mBAAmB,CAAC;IACtC,yEAAyE;IACzE,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;CACxB;AAED,0BAA0B;AAC1B,MAAM,WAAW,OAAO;IACtB;;;OAGG;IACH,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,IAAI,EAAE,WAAW,CAAC;IAC3B,QAAQ,CAAC,MAAM,CAAC,EAAE,aAAa,CAAC;IAChC;;;;;;;;;;;OAWG;IACH,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB;;;OAGG;IACH,QAAQ,CAAC,eAAe,EAAE,MAAM,CAAC;IACjC;;;;OAIG;IACH,QAAQ,CAAC,aAAa,CAAC,EAAE,MAAM,CAAC;IAChC;;;;;;OAMG;IACH,QAAQ,CAAC,kBAAkB,EAAE,OAAO,CAAC;IACrC,mEAAmE;IACnE,QAAQ,CAAC,QAAQ,EAAE,SAAS,YAAY,EAAE,CAAC;IAC3C,2DAA2D;IAC3D,QAAQ,CAAC,QAAQ,CAAC,EAAE,YAAY,CAAC;IACjC,wEAAwE;IACxE,QAAQ,CAAC,OAAO,CAAC,EAAE,eAAe,CAAC;IACnC,2CAA2C;IAC3C,QAAQ,CAAC,IAAI,CAAC,EAAE,gBAAgB,CAAC;CAClC;AAID;;;;;;;;;;;;;GAaG;AACH,MAAM,MAAM,YAAY;AACtB,iEAAiE;AAC/D;IAAE,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IAAC,QAAQ,CAAC,YAAY,EAAE,SAAS,MAAM,EAAE,CAAA;CAAE;AACrE,8EAA8E;GAC5E;IAAE,QAAQ,CAAC,IAAI,EAAE,WAAW,CAAC;IAAC,QAAQ,CAAC,mBAAmB,EAAE,SAAS,MAAM,EAAE,CAAA;CAAE;AACjF,+DAA+D;GAC7D;IAAE,QAAQ,CAAC,IAAI,EAAE,QAAQ,CAAC;IAAC,QAAQ,CAAC,gBAAgB,EAAE,SAAS,MAAM,EAAE,CAAA;CAAE;AAC3E;;;;;GAKG;GACD;IAAE,QAAQ,CAAC,IAAI,EAAE,KAAK,CAAC;IAAC,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;IAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC;AAE7E;;;;;;;;;;;;;GAaG;AACH,MAAM,MAAM,cAAc,GAAG,CAC3B,KAAK,EAAE,SAAS,YAAY,EAAE,EAC9B,GAAG,EAAE;IAAE,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAA;CAAE,KAC3B,OAAO,CAAC,MAAM,CAAC,CAAC;AAErB;;;;;;GAMG;AACH,MAAM,MAAM,iBAAiB,GAAG,CAAC,QAAQ,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,KAAK,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,CAAC;AAElG,2EAA2E;AAC3E,MAAM,WAAW,aAAa;IAC5B,QAAQ,CAAC,MAAM,EAAE,cAAc,CAAC;IAChC,mEAAmE;IACnE,QAAQ,CAAC,cAAc,EAAE,MAAM,CAAC;IAChC,uFAAuF;IACvF,QAAQ,CAAC,OAAO,CAAC,EAAE,MAAM,CAAC;IAC1B,sEAAsE;IACtE,QAAQ,CAAC,cAAc,CAAC,EAAE,iBAAiB,CAAC;IAC5C,gEAAgE;IAChE,QAAQ,CAAC,aAAa,CAAC,EAAE,MAAM,CAAC;IAChC,0EAA0E;IAC1E,QAAQ,CAAC,WAAW,
CAAC,EAAE,MAAM,CAAC;CAC/B;AAID,qEAAqE;AACrE,MAAM,WAAW,UAAU;IACzB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;IAC3B,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;IAC9B,uDAAuD;IACvD,QAAQ,CAAC,aAAa,EAAE,MAAM,CAAC;IAC/B,0EAA0E;IAC1E,QAAQ,CAAC,eAAe,EAAE,MAAM,CAAC;IACjC,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,oEAAoE;IACpE,QAAQ,CAAC,SAAS,CAAC,EAAE;QAAE,QAAQ,CAAC,OAAO,EAAE,OAAO,CAAC;QAAC,QAAQ,CAAC,OAAO,EAAE,OAAO,CAAA;KAAE,CAAC;CAC/E;AAED,MAAM,MAAM,eAAe,GACvB,iBAAiB,GACjB,mBAAmB,GACnB,iBAAiB,GACjB,kBAAkB,GAClB,iBAAiB,GACjB,mBAAmB,CAAC;AAExB,MAAM,WAAW,WAAW;IAC1B,QAAQ,CAAC,IAAI,EAAE,eAAe,CAAC;IAC/B,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;CACvB;AAED,wCAAwC;AACxC,MAAM,WAAW,gBAAgB;IAC/B,gDAAgD;IAChD,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;IAC1B,mCAAmC;IACnC,QAAQ,CAAC,aAAa,EAAE,UAAU,GAAG,SAAS,GAAG,QAAQ,CAAC;IAC1D,yEAAyE;IACzE,QAAQ,CAAC,YAAY,CAAC,EAAE,MAAM,CAAC;IAC/B;;;;;OAKG;IACH,QAAQ,CAAC,IAAI,EAAE,eAAe,GAAG,QAAQ,CAAC;IAC1C,6DAA6D;IAC7D,QAAQ,CAAC,QAAQ,EAAE,SAAS,OAAO,EAAE,CAAC;IACtC,QAAQ,CAAC,UAAU,EAAE,UAAU,CAAC;IAChC,oEAAoE;IACpE,QAAQ,CAAC,YAAY,EAAE,SAAS,WAAW,EAAE,CAAC;IAC9C,+CAA+C;IAC/C,QAAQ,CAAC,QAAQ,CAAC,EAAE,gBAAgB,CAAC;CACtC;AAID,eAAO,MAAM,uBAAuB;IAClC,uDAAuD;;IAEvD,sDAAsD;;IAEtD,0CAA0C;;IAE1C,4DAA4D;;IAE5D,yEAAyE;;IAEzE,mEAAmE;;IAEnE,mDAAmD;;IAEnD,8DAA8D;;CAEtD,CAAC;AAIX,YAAY,EACV,YAAY,EACZ,gBAAgB,EAChB,QAAQ,EACR,gBAAgB,EAChB,eAAe,EACf,aAAa,GACd,CAAC"}
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* analyzeToolCatalog — the tool-catalog confusability lint
|
|
3
|
+
* (RFC-002 block C1, the adoption front door).
|
|
4
|
+
*
|
|
5
|
+
* Pattern: policy layer over `pairwiseSimilarity` (influence-core) — the
|
|
6
|
+
* geometry is computed there; thresholds, verdicts, hints and
|
|
7
|
+
* the structural rule pack live here. Everything is consumer-
|
|
8
|
+
* injectable with our defaults (the plug-and-play meta-pattern).
|
|
9
|
+
* Role: `src/lib/tool-lint/`. ZERO stack buy-in — plain
|
|
10
|
+
* `{ name, description?, inputSchema? }[]` in, report out.
|
|
11
|
+
* `catalogFromTools` adapts the library's own `Tool[]`;
|
|
12
|
+
* `coerceCatalog` (cli.ts) normalizes OpenAI/Anthropic/MCP
|
|
13
|
+
* shapes.
|
|
14
|
+
*
|
|
15
|
+
* ## What is embedded (and why)
|
|
16
|
+
*
|
|
17
|
+
* `confusabilityText(tool)` = tokenized name + ': ' + description. The
|
|
18
|
+
* model differentiates tools by name AND description together, so two
|
|
19
|
+
* tools with near-identical names and overlapping descriptions ARE the
|
|
20
|
+
* confusability case (`get_fcns_database` vs `influx_get_fcns_database`)
|
|
21
|
+
* — embedding only the prose would miss the name signal.
|
|
22
|
+
*
|
|
23
|
+
* ## Calibration (RFC-002 §3 — read this before trusting verdicts)
|
|
24
|
+
*
|
|
25
|
+
* Absolute cosine ranges are PER-EMBEDDER. The default threshold (0.85)
|
|
26
|
+
* is a starting point for real sentence embedders. The test/demo
|
|
27
|
+
* `mockEmbedder` (character-frequency) compresses unrelated prose into
|
|
28
|
+
* ~0.85–0.97 — with it, use `MOCK_EMBEDDER_CALIBRATION` and trust only
|
|
29
|
+
* the RELATIVE ordering in `report.similarity.ranked` (the acceptance
|
|
30
|
+
* fixtures assert ordering, never absolute scores).
|
|
31
|
+
*/
|
|
32
|
+
import type { Tool } from '../../core/tools.js';
|
|
33
|
+
import type { AnalyzeToolCatalogOptions, CatalogTool, ToolCatalogReport } from './types.js';
|
|
34
|
+
/** Default `confusabilityThreshold` — a starting point for REAL sentence
|
|
35
|
+
* embedders (unrelated tool descriptions typically land 0.3–0.7).
|
|
36
|
+
* Calibrate per embedder; meaningless for the mock (see below). */
|
|
37
|
+
export declare const DEFAULT_CONFUSABILITY_THRESHOLD = 0.85;
|
|
38
|
+
/** Default `watchBand` below the threshold. */
|
|
39
|
+
export declare const DEFAULT_WATCH_BAND = 0.05;
|
|
40
|
+
/**
|
|
41
|
+
* Threshold/band calibrated for the char-frequency `mockEmbedder` on
|
|
42
|
+
* realistic tool prose (seed corpus: the Neo SAN catalog). The mock
|
|
43
|
+
* compresses unrelated descriptions into ~0.85–0.97 cosine, so expect
|
|
44
|
+
* false positives even at 0.94 — with the mock, the RELATIVE ordering
|
|
45
|
+
* of `report.similarity.ranked` is the trustworthy signal; absolute
|
|
46
|
+
* verdicts are only honest with a real embedder + per-embedder
|
|
47
|
+
* calibration.
|
|
48
|
+
*/
|
|
49
|
+
export declare const MOCK_EMBEDDER_CALIBRATION: Readonly<{
|
|
50
|
+
confusabilityThreshold: 0.94;
|
|
51
|
+
watchBand: 0.02;
|
|
52
|
+
}>;
|
|
53
|
+
/**
|
|
54
|
+
* Adapt the library's `Tool[]` (from `defineTool` / `Agent.tool`) to the
|
|
55
|
+
* lint's plain catalog shape. Trivial on purpose: `Tool.schema` already
|
|
56
|
+
* IS `{ name, description, inputSchema }`.
|
|
57
|
+
*/
|
|
58
|
+
export declare function catalogFromTools(tools: readonly Tool[]): readonly CatalogTool[];
|
|
59
|
+
/**
|
|
60
|
+
* The text the confusability analysis embeds for one tool: the name with
|
|
61
|
+
* `_`/`-`/camelCase boundaries opened into words, then the description.
|
|
62
|
+
* Exported so consumers can reproduce or replace the construction.
|
|
63
|
+
*/
|
|
64
|
+
export declare function confusabilityText(tool: CatalogTool): string;
|
|
65
|
+
/**
|
|
66
|
+
* Lint a tool catalog: pairwise confusability over what the model reads
|
|
67
|
+
* (when an embedder is supplied) + the structural rule pack. Returns a
|
|
68
|
+
* report whose `ok` is the CI gate.
|
|
69
|
+
*
|
|
70
|
+
* Duplicate tool names are themselves reported as structural errors
|
|
71
|
+
* (rule `duplicate-name`, built-in precondition — a catalog where two
|
|
72
|
+
* tools share a name is broken before any similarity question); the
|
|
73
|
+
* duplicates are dropped from the similarity analysis (first one wins).
|
|
74
|
+
*/
|
|
75
|
+
export declare function analyzeToolCatalog(tools: readonly CatalogTool[], options?: AnalyzeToolCatalogOptions): Promise<ToolCatalogReport>;
|
|
76
|
+
/**
|
|
77
|
+
* Suggest the DIFFERENTIATING AXIS for a flagged pair. Heuristic: when
|
|
78
|
+
* the names are near-twins (≤2 distinct tokens), the qualifier IS the
|
|
79
|
+
* axis — the descriptions must say when to choose each variant. When the
|
|
80
|
+
* names differ, surface the few description terms each tool does NOT
|
|
81
|
+
* share, as the place to anchor an explicit choice condition.
|
|
82
|
+
*/
|
|
83
|
+
export declare function differentiationHint(a: CatalogTool, b: CatalogTool): string;
|
|
84
|
+
//# sourceMappingURL=analyze.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"analyze.d.ts","sourceRoot":"","sources":["../../../../src/lib/tool-lint/analyze.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AAEH,OAAO,KAAK,EAAE,IAAI,EAAE,MAAM,qBAAqB,CAAC;AAGhD,OAAO,KAAK,EACV,yBAAyB,EACzB,WAAW,EAGX,iBAAiB,EAClB,MAAM,YAAY,CAAC;AAEpB;;oEAEoE;AACpE,eAAO,MAAM,+BAA+B,OAAO,CAAC;AAEpD,+CAA+C;AAC/C,eAAO,MAAM,kBAAkB,OAAO,CAAC;AAEvC;;;;;;;;GAQG;AACH,eAAO,MAAM,yBAAyB;;;EAGpC,CAAC;AAEH;;;;GAIG;AACH,wBAAgB,gBAAgB,CAAC,KAAK,EAAE,SAAS,IAAI,EAAE,GAAG,SAAS,WAAW,EAAE,CAM/E;AAED;;;;GAIG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,WAAW,GAAG,MAAM,CAO3D;AAED;;;;;;;;;GASG;AACH,wBAAsB,kBAAkB,CACtC,KAAK,EAAE,SAAS,WAAW,EAAE,EAC7B,OAAO,GAAE,yBAA8B,GACtC,OAAO,CAAC,iBAAiB,CAAC,CAyF5B;AA4CD;;;;;;GAMG;AACH,wBAAgB,mBAAmB,CAAC,CAAC,EAAE,WAAW,EAAE,CAAC,EAAE,WAAW,GAAG,MAAM,CA6B1E"}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tool-lint CLI core (RFC-002 block C3 — the CI gate).
|
|
3
|
+
*
|
|
4
|
+
* Pattern: humble shell — `bin/agentfootprint-lint-tools.mjs` is a
|
|
5
|
+
* 3-line wrapper; ALL behavior (arg parsing, catalog coercion,
|
|
6
|
+
* report, exit code) lives here so it is unit-testable without
|
|
7
|
+
* spawning a process.
|
|
8
|
+
* Role: `src/lib/tool-lint/`. Reads ONE JSON file of tools, prints a
|
|
9
|
+
* report, returns the process exit code:
|
|
10
|
+
* 0 — report.ok
|
|
11
|
+
* 1 — findings failed the gate (!ok)
|
|
12
|
+
* 2 — usage / input error (bad flags, unreadable file,
|
|
13
|
+
* unrecognized JSON shape)
|
|
14
|
+
*
|
|
15
|
+
* ## Embedder & gating honesty
|
|
16
|
+
*
|
|
17
|
+
* The CLI has no way to receive a consumer embedder, so it uses the
|
|
18
|
+
* built-in deterministic mock (char-frequency, offline, dependency-free)
|
|
19
|
+
* for the similarity RANKING — and, by default, does NOT gate on it:
|
|
20
|
+
* without `--threshold`, similarity is report-only (relative ordering +
|
|
21
|
+
* watch hints) and the exit code reflects structural findings alone.
|
|
22
|
+
* Pass `--threshold` to make confusable pairs fail the gate — you own
|
|
23
|
+
* the calibration at that point (start from
|
|
24
|
+
* `MOCK_EMBEDDER_CALIBRATION.confusabilityThreshold` = 0.94). For real
|
|
25
|
+
* embedder gating, use `analyzeToolCatalog` from
|
|
26
|
+
* `agentfootprint/observe` in a small script instead.
|
|
27
|
+
*/
|
|
28
|
+
import type { CatalogTool } from './types.js';
|
|
29
|
+
/**
 * Output sinks accepted by `runToolLintCli` through its optional `io`
 * argument. Each member is a callback that takes one string and returns
 * nothing.
 * NOTE(review): presumably injected so tests can capture CLI output
 * without touching the real process streams — confirm against cli.ts.
 */
export interface ToolLintCliIO {
|
|
30
|
+
/** Sink for text addressed to standard output; called with one string per call. */
readonly stdout: (line: string) => void;
|
|
31
|
+
/** Sink for text addressed to standard error; called with one string per call. */
readonly stderr: (line: string) => void;
|
|
32
|
+
}
|
|
33
|
+
/**
|
|
34
|
+
* Normalize any of the recognized tool-list JSON shapes to the lint's
|
|
35
|
+
* plain catalog. Throws (with a shape description) on unrecognized
|
|
36
|
+
* input — the CLI maps that to exit code 2.
|
|
37
|
+
*/
|
|
38
|
+
export declare function coerceCatalog(json: unknown): readonly CatalogTool[];
|
|
39
|
+
/**
|
|
40
|
+
* Run the lint CLI. Returns the exit code (never calls `process.exit` —
|
|
41
|
+
* the bin wrapper assigns it to `process.exitCode`).
|
|
42
|
+
*/
|
|
43
|
+
export declare function runToolLintCli(argv: readonly string[], io?: ToolLintCliIO): Promise<number>;
|
|
44
|
+
//# sourceMappingURL=cli.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../../../../src/lib/tool-lint/cli.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;GA0BG;AAMH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AAE9C,MAAM,WAAW,aAAa;IAC5B,QAAQ,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,IAAI,CAAC;IACxC,QAAQ,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,IAAI,CAAC;CACzC;AAqBD;;;;GAIG;AACH,wBAAgB,aAAa,CAAC,IAAI,EAAE,OAAO,GAAG,SAAS,WAAW,EAAE,CAuCnE;AAmDD;;;GAGG;AACH,wBAAsB,cAAc,CAClC,IAAI,EAAE,SAAS,MAAM,EAAE,EACvB,EAAE,GAAE,aAKH,GACA,OAAO,CAAC,MAAM,CAAC,CAkDjB"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* formatToolCatalogReport — human-readable rendering of a lint report.
|
|
3
|
+
*
|
|
4
|
+
* Pattern: pure presenter. One report → one string; used verbatim by the
|
|
5
|
+
* CLI (`agentfootprint-lint-tools`) and the examples so output
|
|
6
|
+
* stays byte-identical across surfaces.
|
|
7
|
+
* Role: `src/lib/tool-lint/` leaf. No I/O.
|
|
8
|
+
*/
|
|
9
|
+
import type { ToolCatalogReport } from './types.js';
|
|
10
|
+
/**
 * Display limits for `formatToolCatalogReport`. Both fields are optional
 * and only bound how much of the report is printed; per the field notes
 * below, the report object itself still carries every pair.
 */
export interface FormatReportOptions {
|
|
11
|
+
/** How many ranked pairs to show in the relative-ordering section.
|
|
12
|
+
* Default 10. 0 hides the section. */
|
|
13
|
+
readonly topPairs?: number;
|
|
14
|
+
/** How many WATCH pairs to print before eliding the rest (the report
|
|
15
|
+
* object always carries all of them). Default 10. */
|
|
16
|
+
readonly maxWatch?: number;
|
|
17
|
+
}
|
|
18
|
+
/**
 * Render a lint report as one human-readable string (pure presenter,
 * no I/O).
 *
 * @param report - The `ToolCatalogReport` to render.
 * @param options - Optional display limits (`topPairs`, `maxWatch`);
 *   may be omitted entirely.
 * @returns The rendered report text.
 */
export declare function formatToolCatalogReport(report: ToolCatalogReport, options?: FormatReportOptions): string;
|
|
19
|
+
//# sourceMappingURL=format.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"format.d.ts","sourceRoot":"","sources":["../../../../src/lib/tool-lint/format.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,YAAY,CAAC;AAEpD,MAAM,WAAW,mBAAmB;IAClC;2CACuC;IACvC,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAC3B;0DACsD;IACtD,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,wBAAgB,uBAAuB,CACrC,MAAM,EAAE,iBAAiB,EACzB,OAAO,GAAE,mBAAwB,GAChC,MAAM,CA6DR"}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* tool-lint — the tool-catalog confusability lint (RFC-002 tier 1,
|
|
3
|
+
* blocks C1–C3).
|
|
4
|
+
*
|
|
5
|
+
* Build-time, CI-gateable, framework-agnostic: a plain
|
|
6
|
+
* `{ name, description?, inputSchema? }[]` in (any OpenAI / Anthropic /
|
|
7
|
+
* LangChain / MCP tool list coerces to it), a report with a CI-gateable
|
|
8
|
+
* `ok` out. The embedding geometry comes from influence-core
|
|
9
|
+
* (`pairwiseSimilarity`); this module is the policy layer — thresholds,
|
|
10
|
+
* verdicts, hints, and the pluggable structural rule pack.
|
|
11
|
+
*
|
|
12
|
+
* Surfaces:
|
|
13
|
+
* - `analyzeToolCatalog(tools, opts)` — the API (C1)
|
|
14
|
+
* - `defaultStructuralRules` + rule factories — the rule pack (C2)
|
|
15
|
+
* - `runToolLintCli` / bin `agentfootprint-lint-tools` — the gate (C3)
|
|
16
|
+
*
|
|
17
|
+
* Front-door guide: docs/guides/tool-catalog-lint.md
|
|
18
|
+
*/
|
|
19
|
+
export type { AnalyzeToolCatalogOptions, CatalogTool, ConfusablePairFinding, LintRule, LintSeverity, PairVerdict, SimilarityReport, StructuralFinding, ToolCatalogReport, } from './types.js';
|
|
20
|
+
export { analyzeToolCatalog, catalogFromTools, confusabilityText, differentiationHint, DEFAULT_CONFUSABILITY_THRESHOLD, DEFAULT_WATCH_BAND, MOCK_EMBEDDER_CALIBRATION, } from './analyze.js';
|
|
21
|
+
export { defaultStructuralRules, descriptionRule, enumInProseRule, optionalParamRule, saysWhatNotWhenRule, DEFAULT_OMISSION_CUES, DEFAULT_WHEN_CUES, type DescriptionRuleOptions, type OptionalParamRuleOptions, type SaysWhatNotWhenRuleOptions, } from './rules.js';
|
|
22
|
+
export { formatToolCatalogReport, type FormatReportOptions } from './format.js';
|
|
23
|
+
export { coerceCatalog, runToolLintCli, type ToolLintCliIO } from './cli.js';
|
|
24
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/lib/tool-lint/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;GAiBG;AAEH,YAAY,EACV,yBAAyB,EACzB,WAAW,EACX,qBAAqB,EACrB,QAAQ,EACR,YAAY,EACZ,WAAW,EACX,gBAAgB,EAChB,iBAAiB,EACjB,iBAAiB,GAClB,MAAM,YAAY,CAAC;AAEpB,OAAO,EACL,kBAAkB,EAClB,gBAAgB,EAChB,iBAAiB,EACjB,mBAAmB,EACnB,+BAA+B,EAC/B,kBAAkB,EAClB,yBAAyB,GAC1B,MAAM,cAAc,CAAC;AAEtB,OAAO,EACL,sBAAsB,EACtB,eAAe,EACf,eAAe,EACf,iBAAiB,EACjB,mBAAmB,EACnB,qBAAqB,EACrB,iBAAiB,EACjB,KAAK,sBAAsB,EAC3B,KAAK,wBAAwB,EAC7B,KAAK,0BAA0B,GAChC,MAAM,YAAY,CAAC;AAEpB,OAAO,EAAE,uBAAuB,EAAE,KAAK,mBAAmB,EAAE,MAAM,aAAa,CAAC;AAEhF,OAAO,EAAE,aAAa,EAAE,cAAc,EAAE,KAAK,aAAa,EAAE,MAAM,UAAU,CAAC"}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Structural lint rules (RFC-002 block C2) — the PLUGGABLE RULE PACK.
|
|
3
|
+
*
|
|
4
|
+
* Pattern: Strategy list — each rule is a plain `{ id, check }` object;
|
|
5
|
+
* `defaultStructuralRules` is OUR pack, and consumers add /
|
|
6
|
+
* remove / replace freely via `AnalyzeToolCatalogOptions.rules`.
|
|
7
|
+
* Parameterizable rules ship as FACTORIES (`descriptionRule`,
|
|
8
|
+
* `saysWhatNotWhenRule`, …) returning a configured `LintRule`.
|
|
9
|
+
* Role: `src/lib/tool-lint/` leaf. Pure functions over `CatalogTool`;
|
|
10
|
+
* no embedder, no I/O.
|
|
11
|
+
*
|
|
12
|
+
* Every rule encodes a FIELD FINDING from real catalogs (the Neo SAN
|
|
13
|
+
* triage agent's 29-tool catalog was the seed corpus):
|
|
14
|
+
*
|
|
15
|
+
* 1. description-missing-or-short — the model can only guess from a name.
|
|
16
|
+
* 2. says-what-not-when — describes WHAT the tool returns but gives the
|
|
17
|
+
* model no cue for WHEN to pick it over a sibling (the #1 cause of
|
|
18
|
+
* twin-tool confusion: 'get_fcns_database' vs 'influx_get_fcns_database').
|
|
19
|
+
* 3. enum-in-prose — string params whose legal values are listed in prose
|
|
20
|
+
* ("avg_iops | peak_iops | mbps") instead of a JSON-Schema `enum` the
|
|
21
|
+
* model (and validators, see #9 tool-args validation) can act on.
|
|
22
|
+
* 4. optional-param-undocumented — optional params whose omission has
|
|
23
|
+
* meaning (fabric-wide sweep vs one switch) but whose schema never
|
|
24
|
+
* says so; the model can't reason about leaving them out.
|
|
25
|
+
*
|
|
26
|
+
* Honest claim: these are token/regex HEURISTICS. They flag review
|
|
27
|
+
* prompts, not certainties — expect (rare) false positives and tune via
|
|
28
|
+
* the factory options instead of deleting the rule.
|
|
29
|
+
*/
|
|
30
|
+
import type { LintRule } from './types.js';
|
|
31
|
+
/** Tuning options accepted by the `descriptionRule` factory; every field is optional. */
export interface DescriptionRuleOptions {
|
|
32
|
+
/** Descriptions shorter than this (in chars) get a `warn`. Default 40. */
|
|
33
|
+
readonly minChars?: number;
|
|
34
|
+
}
|
|
35
|
+
/**
|
|
36
|
+
* Missing description → `error` (the model can only guess from the
|
|
37
|
+
* name). Present but shorter than `minChars` → `warn` (too short to
|
|
38
|
+
* differentiate from siblings).
|
|
39
|
+
*/
|
|
40
|
+
export declare function descriptionRule(options?: DescriptionRuleOptions): LintRule;
|
|
41
|
+
/** RFC-002 C2 heuristic cue list — temporal/conditional words whose
|
|
42
|
+
* presence suggests the description says WHEN to use the tool. */
|
|
43
|
+
export declare const DEFAULT_WHEN_CUES: readonly string[];
|
|
44
|
+
/** Tuning options accepted by the `saysWhatNotWhenRule` factory; every field is optional. */
export interface SaysWhatNotWhenRuleOptions {
|
|
45
|
+
/** Cue tokens (whole-word, case-insensitive). Default `DEFAULT_WHEN_CUES`. */
|
|
46
|
+
readonly cueTokens?: readonly string[];
|
|
47
|
+
}
|
|
48
|
+
/**
|
|
49
|
+
* A description with NO temporal/conditional cue token usually describes
|
|
50
|
+
* WHAT the tool returns but never WHEN to pick it — the #1 cause of
|
|
51
|
+
* twin-tool confusion. Heuristic by design: tune `cueTokens` rather than
|
|
52
|
+
* dropping the rule. Skips tools with no description (that is the `description-missing-or-short` finding, reported by `descriptionRule`).
|
|
53
|
+
*/
|
|
54
|
+
export declare function saysWhatNotWhenRule(options?: SaysWhatNotWhenRuleOptions): LintRule;
|
|
55
|
+
/**
|
|
56
|
+
* A string param whose description enumerates its legal values in prose
|
|
57
|
+
* (pipe-separated literals, or comma lists behind "one of"/"allowed
|
|
58
|
+
* values") should declare a JSON-Schema `enum` instead — the model picks
|
|
59
|
+
* reliably from enums, and arg validators (#9) can enforce them. The
|
|
60
|
+
* field case: Neo's `influx_get_port_ranking.metric` =
|
|
61
|
+
* `"avg_iops | peak_iops | mbps"`.
|
|
62
|
+
*/
|
|
63
|
+
export declare function enumInProseRule(): LintRule;
|
|
64
|
+
/** Words that signal the description DOES say what omission means. */
|
|
65
|
+
export declare const DEFAULT_OMISSION_CUES: readonly string[];
|
|
66
|
+
/** Tuning options accepted by the `optionalParamRule` factory; every field is optional. */
export interface OptionalParamRuleOptions {
|
|
67
|
+
/** Cue tokens that satisfy the rule. Default `DEFAULT_OMISSION_CUES`. */
|
|
68
|
+
readonly omissionCues?: readonly string[];
|
|
69
|
+
}
|
|
70
|
+
/**
|
|
71
|
+
* An optional param's omission usually MEANS something (Neo:
|
|
72
|
+
* `influx_get_interface_counters` without `switch_name` = fabric-wide
|
|
73
|
+
* sweep) — but the model can only reason about leaving a param out if
|
|
74
|
+
* the description says so. No description at all, or one with no
|
|
75
|
+
* omission cue, gets a `warn`.
|
|
76
|
+
*/
|
|
77
|
+
export declare function optionalParamRule(options?: OptionalParamRuleOptions): LintRule;
|
|
78
|
+
/**
|
|
79
|
+
* OUR rule pack, built with default options. Compose your own:
|
|
80
|
+
*
|
|
81
|
+
* rules: [...defaultStructuralRules, myRule] // add
|
|
82
|
+
* rules: defaultStructuralRules.filter(r => r.id !== '…') // remove
|
|
83
|
+
* rules: [descriptionRule({ minChars: 80 }), …] // re-tune
|
|
84
|
+
*/
|
|
85
|
+
export declare const defaultStructuralRules: readonly LintRule[];
|
|
86
|
+
//# sourceMappingURL=rules.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"rules.d.ts","sourceRoot":"","sources":["../../../../src/lib/tool-lint/rules.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AAEH,OAAO,KAAK,EAAe,QAAQ,EAAqB,MAAM,YAAY,CAAC;AA2C3E,MAAM,WAAW,sBAAsB;IACrC,0EAA0E;IAC1E,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED;;;;GAIG;AACH,wBAAgB,eAAe,CAAC,OAAO,GAAE,sBAA2B,GAAG,QAAQ,CA6B9E;AAID;mEACmE;AACnE,eAAO,MAAM,iBAAiB,EAAE,SAAS,MAAM,EAO9C,CAAC;AAEF,MAAM,WAAW,0BAA0B;IACzC,8EAA8E;IAC9E,QAAQ,CAAC,SAAS,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;CACxC;AAED;;;;;GAKG;AACH,wBAAgB,mBAAmB,CAAC,OAAO,GAAE,0BAA+B,GAAG,QAAQ,CAqBtF;AAcD;;;;;;;GAOG;AACH,wBAAgB,eAAe,IAAI,QAAQ,CA4B1C;AAgBD,sEAAsE;AACtE,eAAO,MAAM,qBAAqB,EAAE,SAAS,MAAM,EAalD,CAAC;AAEF,MAAM,WAAW,wBAAwB;IACvC,yEAAyE;IACzE,QAAQ,CAAC,YAAY,CAAC,EAAE,SAAS,MAAM,EAAE,CAAC;CAC3C;AAED;;;;;;GAMG;AACH,wBAAgB,iBAAiB,CAAC,OAAO,GAAE,wBAA6B,GAAG,QAAQ,CA+BlF;AAID;;;;;;GAMG;AACH,eAAO,MAAM,sBAAsB,EAAE,SAAS,QAAQ,EAKrD,CAAC"}
|