@tangle-network/agent-eval 0.71.0 → 0.72.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. package/CHANGELOG.md +63 -0
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/langchain.d.ts +1 -1
  4. package/dist/adapters/otel.d.ts +3 -2
  5. package/dist/agent-profile-DYRboYWu.d.ts +364 -0
  6. package/dist/analyst/index.d.ts +221 -0
  7. package/dist/analyst/index.js +371 -0
  8. package/dist/analyst/index.js.map +1 -0
  9. package/dist/analyst-t7zZS3TV.d.ts +88 -0
  10. package/dist/campaign/index.d.ts +485 -9
  11. package/dist/campaign/index.js +618 -30
  12. package/dist/campaign/index.js.map +1 -1
  13. package/dist/chunk-7W4SM7FD.js +1075 -0
  14. package/dist/chunk-7W4SM7FD.js.map +1 -0
  15. package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
  16. package/dist/chunk-JHA3ZGSO.js +1496 -0
  17. package/dist/chunk-JHA3ZGSO.js.map +1 -0
  18. package/dist/{chunk-VMAYE3LM.js → chunk-JYE3WOTE.js} +57 -9
  19. package/dist/{chunk-VMAYE3LM.js.map → chunk-JYE3WOTE.js.map} +1 -1
  20. package/dist/chunk-LB2UOI5F.js +412 -0
  21. package/dist/chunk-LB2UOI5F.js.map +1 -0
  22. package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
  23. package/dist/chunk-VUINJM5M.js.map +1 -0
  24. package/dist/chunk-WYIHD6EB.js +1044 -0
  25. package/dist/chunk-WYIHD6EB.js.map +1 -0
  26. package/dist/{chunk-6QZUCFKM.js → chunk-XPILG2CA.js} +120 -3
  27. package/dist/chunk-XPILG2CA.js.map +1 -0
  28. package/dist/{chunk-6XQIEUQ2.js → chunk-ZPSKPT3V.js} +5 -3
  29. package/dist/{chunk-6XQIEUQ2.js.map → chunk-ZPSKPT3V.js.map} +1 -1
  30. package/dist/contract/index.d.ts +17 -13
  31. package/dist/contract/index.js +14 -8
  32. package/dist/contract/index.js.map +1 -1
  33. package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
  34. package/dist/control.d.ts +2 -2
  35. package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
  36. package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
  37. package/dist/hosted/index.d.ts +223 -2
  38. package/dist/index.d.ts +49 -1323
  39. package/dist/index.js +339 -2627
  40. package/dist/index.js.map +1 -1
  41. package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
  42. package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
  43. package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
  44. package/dist/openapi.json +1 -1
  45. package/dist/pareto-E-pembql.d.ts +81 -0
  46. package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
  47. package/dist/redact-B40YG2M_.d.ts +45 -0
  48. package/dist/registry-DuVYiTvw.d.ts +128 -0
  49. package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
  50. package/dist/rl.d.ts +4 -3
  51. package/dist/rl.js +4 -4
  52. package/dist/{run-campaign-BVY3RGAZ.js → run-campaign-OVEZF24D.js} +2 -2
  53. package/dist/run-critic-BAIjX99r.d.ts +56 -0
  54. package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
  55. package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
  56. package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
  57. package/dist/traces.d.ts +371 -308
  58. package/dist/traces.js +43 -18
  59. package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
  60. package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
  61. package/dist/wire/index.d.ts +1 -1
  62. package/dist/workflow/index.d.ts +494 -0
  63. package/dist/workflow/index.js +2177 -0
  64. package/dist/workflow/index.js.map +1 -0
  65. package/docs/design/self-improvement-roadmap.md +106 -0
  66. package/package.json +36 -12
  67. package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
  68. package/dist/chunk-6QZUCFKM.js.map +0 -1
  69. package/dist/chunk-ODGETRTM.js.map +0 -1
  70. package/dist/chunk-PQV2TKC3.js +0 -27
  71. package/dist/chunk-PQV2TKC3.js.map +0 -1
  72. package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0
  73. package/dist/{run-campaign-BVY3RGAZ.js.map → run-campaign-OVEZF24D.js.map} +0 -0
@@ -0,0 +1,650 @@
1
+ import { AxAIService } from '@ax-llm/ax';
2
+ import { c as TraceAnalystKindSpec } from './kind-factory-DW9XWPvM.js';
3
+ import { b as AnalystRegistryOptions, a as AnalystRegistry } from './registry-DuVYiTvw.js';
4
+ import { z } from 'zod';
5
+ import { c as AnalystFinding, A as Analyst, a as AnalystContext } from './types-CRD68aH7.js';
6
+ import { a as TraceAnalystSpan } from './store-GmBE2pZZ.js';
7
+ import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
8
+ import { S as Severity } from './multi-layer-verifier-DlWCXuxL.js';
9
+
10
+ interface CreateAnalystAiConfig {
11
+ /** OpenAI-compatible API key forwarded as `Authorization: Bearer`.
12
+ * cli-bridge ignores the value on loopback but Ax requires a non-empty string. */
13
+ apiKey: string;
14
+ /** OpenAI-compatible base URL — e.g. `https://router.tangle.tools/v1` or a
15
+ * cli-bridge loopback. */
16
+ baseUrl: string;
17
+ /** Model id forwarded to the analyst actor + responder. */
18
+ model: string;
19
+ /** Ax provider name. Defaults to the OpenAI-compatible client. */
20
+ provider?: 'openai' | 'anthropic';
21
+ }
22
+ /**
23
+ * Construct the `AxAIService` an analyst kind calls through
24
+ * (`createTraceAnalystKind({ ai })`).
25
+ *
26
+ * Ax's `ai()` pins `config.model` to the OpenAI catalog enum, but every
27
+ * OpenAI-compatible router an analyst points at (router.tangle.tools,
28
+ * cli-bridge) accepts arbitrary model ids (claude-code/sonnet, openai/gpt-5.4,
29
+ * …). Consumers were each re-rolling `ai({ name, apiKey, apiURL, config })`
30
+ * behind an `as (a: any) => any` cast to dodge the enum; this is the one
31
+ * canonical constructor so they don't have to — and don't take a direct
32
+ * `@ax-llm/ax` dependency for it.
33
+ */
34
+ declare function createAnalystAi(config: CreateAnalystAiConfig): AxAIService;
35
+
36
+ /**
37
+ * `buildDefaultAnalystRegistry` — the canonical analyst suite, so consumers
38
+ * stop hand-wiring `new AnalystRegistry()` + per-kind `createTraceAnalystKind`.
39
+ *
40
+ * The deterministic `behavioralAnalyst` is ALWAYS registered (it needs no
41
+ * model and is model-agnostic by construction). The agentic RLM kinds are
42
+ * registered only when an `ai` service is supplied — so a caller with no LLM
43
+ * still gets the full behavioral/efficiency diagnosis, and the substrate's
44
+ * "any model (including no model)" guarantee holds at the suite level.
45
+ */
46
+
47
+ interface DefaultAnalystRegistryOptions {
48
+ /** Ax service for the agentic RLM kinds. Omit → only the deterministic analyst. */
49
+ ai?: AxAIService;
50
+ /** Model for the agentic kinds (falls back to the ai service default). */
51
+ model?: string;
52
+ /** Which agentic kinds to register when `ai` is present. Default = the shipped suite. */
53
+ kinds?: readonly TraceAnalystKindSpec[];
54
+ /** Set false to omit the deterministic behavioral analyst (default: include). */
55
+ includeBehavioral?: boolean;
56
+ /** Forwarded to the AnalystRegistry constructor (signal, tags, priorFindings). */
57
+ registry?: AnalystRegistryOptions;
58
+ }
59
+ declare function buildDefaultAnalystRegistry(opts?: DefaultAnalystRegistryOptions): AnalystRegistry;
60
+
61
+ /**
62
+ * Typed `FindingSubject` — the canonical grammar every analyst kind emits.
63
+ *
64
+ * Background: kind actor prompts have always documented a subject grammar
65
+ * (e.g. `system-prompt:<section>`, `agent-knowledge:wiki:<slug>`) but the
66
+ * LLM was unconstrained — it could emit `subject: "fix the prompt"`
67
+ * (prose) and downstream adapters routed on `startsWith(...)` would
68
+ * silently skip it. Every per-vertical `ImprovementAdapter` had a
69
+ * routing table that mostly caught nothing.
70
+ *
71
+ * This module fixes that:
72
+ * - `parseFindingSubject(raw)` — returns the typed `FindingSubject`
73
+ * when `raw` matches the grammar, else `null`. Used at the
74
+ * `RawAnalystFindingSchema` boundary so malformed subjects are
75
+ * rejected loudly instead of silently lifted into the registry.
76
+ * - `FindingSubject` — the discriminated union of valid loci
77
+ * (`FindingSubjectKind` is the union of its `kind` tags). Each variant
78
+ * carries the typed components downstream adapters resolve against the
79
+ * agent's surface manifest (no string parsing in the adapter).
80
+ * - `FINDING_SUBJECT_GRAMMAR_PROMPT` — single source of truth for the
81
+ * grammar string embedded in kind actor prompts. Drift between
82
+ * prompt and parser is impossible if every kind imports this.
83
+ *
84
+ * The grammar is intentionally NARROW — only loci the substrate's
85
+ * default `ImprovementAdapter` / `KnowledgeAdapter` can act on. A
86
+ * finding with a subject outside this set fails the parser; the kind
87
+ * author either extends the grammar here (and adds adapter routing)
88
+ * or rephrases the prompt to map onto an existing variant.
89
+ *
90
+ * `failure-mode` is the one exception — its subjects are free-form
91
+ * cluster labels, not loci. The schema preserves them as
92
+ * `{ kind: 'cluster', label }` and the adapters skip them (cluster
93
+ * findings are evidence, not actionable mutations).
94
+ */
95
+
96
+ /**
97
+ * Discriminated union of every locus the substrate can route findings to.
98
+ *
99
+ * Adapters narrow on `kind` and use the typed components (no string
100
+ * parsing). Adding a variant here REQUIRES updating the parser, the
101
+ * grammar prompt, and at least one adapter — by design.
102
+ */
103
+ type FindingSubject = {
104
+ kind: 'knowledge.wiki';
105
+ slug: string;
106
+ heading?: string;
107
+ } | {
108
+ kind: 'knowledge.claim';
109
+ topic: string;
110
+ } | {
111
+ kind: 'knowledge.raw';
112
+ sourceId: string;
113
+ } | {
114
+ kind: 'knowledge.stale';
115
+ slug: string;
116
+ } | {
117
+ kind: 'system-prompt';
118
+ section: string;
119
+ } | {
120
+ kind: 'tool-doc';
121
+ tool: string;
122
+ aspect?: string;
123
+ } | {
124
+ kind: 'new-tool';
125
+ name: string;
126
+ } | {
127
+ kind: 'rag';
128
+ corpus: string;
129
+ docId: string;
130
+ } | {
131
+ kind: 'memory';
132
+ key: string;
133
+ } | {
134
+ kind: 'scaffolding';
135
+ concern: string;
136
+ } | {
137
+ kind: 'output-schema';
138
+ field: string;
139
+ } | {
140
+ kind: 'websearch.outdated';
141
+ topic: string;
142
+ } | {
143
+ kind: 'prior-run-summary';
144
+ topic: string;
145
+ } | {
146
+ kind: 'cluster';
147
+ label: string;
148
+ };
149
+ type FindingSubjectKind = FindingSubject['kind'];
150
+ declare const FINDING_SUBJECT_KINDS: ReadonlyArray<FindingSubjectKind>;
151
+ /**
152
+ * Parse a raw subject string emitted by an analyst kind's actor.
153
+ *
154
+ * Returns the typed `FindingSubject` when `raw` matches the grammar,
155
+ * else `null`. Callers use the `null` return as a signal to either
156
+ * (a) reject the finding at parse time (kinds that emit typed loci —
157
+ * knowledge-gap, improvement, knowledge-poisoning) or (b) lift it as
158
+ * a cluster label (failure-mode).
159
+ *
160
+ * Slugs are constrained to `[a-z0-9-]+` (lowercase kebab) to keep file
161
+ * paths sane downstream. Topics / keys / sections allow any non-empty
162
+ * string (free-form for the LLM's voice) but get trimmed.
163
+ *
164
+ * Empty / whitespace-only inputs return `null`. `undefined` returns
165
+ * `null`; `null` is also accepted by the signature (presumably handled the same way — confirm). Both are surfaced by the caller as a rejected subject.
166
+ */
167
+ declare function parseFindingSubject(raw: string | null | undefined): FindingSubject | null;
168
+ /**
169
+ * Render the parsed subject back to its canonical string form. Inverse
170
+ * of `parseFindingSubject`; useful when the substrate constructs new
171
+ * findings programmatically (e.g. for tests, replays, or
172
+ * `id_basis` carry-forward).
173
+ */
174
+ declare function renderFindingSubject(s: FindingSubject): string;
175
+ /**
176
+ * The grammar text embedded into kind actor prompts. Kinds opt into
177
+ * the subset of variants they emit (e.g. `improvement` excludes the
178
+ * cluster variant; `failure-mode` includes ONLY the cluster variant).
179
+ *
180
+ * Drift between prompt and parser is impossible: every kind imports
181
+ * this constant + the matching `expects` set, and this module's unit tests
182
+ * lock the table to the parser.
183
+ */
184
+ declare const FINDING_SUBJECT_GRAMMAR_PROMPT: string;
185
+ /**
186
+ * The variants each kind is allowed to emit. Used at the kind factory
187
+ * boundary so a knowledge-gap finding can't sneak in a `system-prompt:*`
188
+ * subject (the improvement-analyst's job) and vice versa.
189
+ *
190
+ * `failure-mode` is restricted to `cluster` — the only kind that emits
191
+ * a non-locus subject.
192
+ */
193
+ declare const KIND_EXPECTED_SUBJECTS: Record<string, ReadonlyArray<FindingSubjectKind>>;
194
+ /**
195
+ * Zod schema that validates a raw subject string and returns the parsed
196
+ * `FindingSubject`. Embedded in `RawAnalystFindingSchema` via
197
+ * `transform`, so `subject` arrives at the kind factory either as a
198
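+ * typed locus or as a parse error attached to a single Zod issue. NOTE(review): the declared type here is `z.ZodString` (string output), so the parsed `FindingSubject` must be produced by the embedding schema — confirm.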
199
+ *
200
+ * Optionality is preserved: subjects ARE optional on the wire (some
201
+ * findings are descriptive, not actionable). When present, they MUST
202
+ * parse — emitting a malformed subject is a contract violation, not a
203
+ * soft signal.
204
+ */
205
+ declare const FindingSubjectStringSchema: z.ZodString;
206
+
207
+ /**
208
+ * FindingsStore — durable persistence for AnalystFinding rows + a diff
209
+ * helper so we can answer "what changed since the last run?" without
210
+ * recomputing analysts.
211
+ *
212
+ * On-disk shape is JSONL: one finding per line, append-only, locked via
213
+ * LockedJsonlAppender. Operators get crash-safety (no partial JSON),
214
+ * cheap reads (sequential parse), and trivial backup (rsync the file).
215
+ *
216
+ * Reads are non-locking: a reader sees a consistent snapshot of all
217
+ * fully-written lines and skips an incomplete trailing line if the
218
+ * writer is mid-append. Cross-process locking is intentionally out of
219
+ * scope (see locked-jsonl-appender.ts).
220
+ *
221
+ * The store is run-scoped: callers pass `runId` on append and on load,
222
+ * which keeps multi-run files cleanly partitioned. The `diffFindings`
223
+ * helper compares two run-id sets using stable `finding_id` semantics —
224
+ * the diff is the cross-run signal the regression dashboard renders.
225
+ */
226
+
227
+ /**
228
+ * One persisted row. We attach `run_id` on disk so a single file can
229
+ * hold multiple runs and the diff helper can query without re-walking
230
+ * separate files.
231
+ */
232
+ interface PersistedFinding extends AnalystFinding {
233
+ run_id: string;
234
+ }
235
+ declare class FindingsStore {
236
+ readonly path: string;
237
+ private readonly appender;
238
+ constructor(path: string);
239
+ append(runId: string, findings: AnalystFinding[]): Promise<void>;
240
+ /** Load every persisted finding. Discards malformed trailing lines silently. */
241
+ loadAll(): PersistedFinding[];
242
+ /** Filter to a single run. */
243
+ loadRun(runId: string): PersistedFinding[];
244
+ }
245
+ interface FindingsDiff {
246
+ /** New finding ids in `current` that weren't in `previous`. */
247
+ appeared: PersistedFinding[];
248
+ /** Finding ids in `previous` that aren't in `current`. */
249
+ disappeared: PersistedFinding[];
250
+ /** Same finding id present in both runs and unchanged per the materiality test. */
251
+ persisted: PersistedFinding[];
252
+ /**
253
+ * Same finding id in both runs but at least one non-identity field
254
+ * shifted per `DiffPolicy.isMaterial`. Reported as `{ previous, current }` pairs.
255
+ */
256
+ changed: Array<{
257
+ previous: PersistedFinding;
258
+ current: PersistedFinding;
259
+ }>;
260
+ }
261
+ interface DiffPolicy {
262
+ /**
263
+ * Predicate that decides whether two findings (same finding_id) count
264
+ * as a material change. Defaults to {@link defaultIsMaterial}: severity
265
+ * shift, confidence Δ > 0.05, or evidence count change. Compliance /
266
+ * perf consumers MAY supply a stricter predicate (e.g. rationale text
267
+ * diff, metric Δ thresholds).
268
+ */
269
+ isMaterial?: (previous: AnalystFinding, current: AnalystFinding) => boolean;
270
+ }
271
+ /**
272
+ * Default materiality test. Deliberately narrow so LLM-reword churn
273
+ * doesn't flood the diff. Stricter tests are opt-in via DiffPolicy.
274
+ */
275
+ declare function defaultIsMaterial(a: AnalystFinding, b: AnalystFinding): boolean;
276
+ /**
277
+ * Diff two findings sets by stable finding_id. Callers typically load
278
+ * the two run-id slices from the same store and pass them in.
279
+ */
280
+ declare function diffFindings(previous: PersistedFinding[], current: PersistedFinding[], policy?: DiffPolicy): FindingsDiff;
281
+
282
+ /**
283
+ * Failure-mode analyst — classifies what went wrong and why.
284
+ *
285
+ * Brief: read the trace dataset, identify the top failure modes across
286
+ * runs, classify each with severity + evidence, and surface them as
287
+ * findings. The actor's job is *taxonomy + evidence*, not fix-design —
288
+ * that's the improvement-analyst's job.
289
+ *
290
+ * Recursion is deep (`maxDepth: 3`) because real failure-mode
291
+ * discovery is genuinely tree-shaped: the actor splits the dataset
292
+ * into candidate clusters, each cluster spawns a focused investigator
293
+ * that drills into representative traces, and a deeply-recursed
294
+ * investigator may itself split a confounded mode into two sub-modes.
295
+ * Each level fans out 4-way, so the analyst can investigate up to
296
+ * ~16 leaf clusters before hitting the depth ceiling.
297
+ */
298
+
299
+ declare const FAILURE_MODE_KIND_SPEC: TraceAnalystKindSpec;
300
+
301
+ /**
302
+ * Improvement analyst — actionable, recursive self-improvement findings.
303
+ *
304
+ * Brief: read findings from upstream analysts (failure-mode,
305
+ * knowledge-gap, knowledge-poisoning) AND the trace dataset itself,
306
+ * then propose **concrete edits** to the agent's runtime: prompt
307
+ * additions, RAG documents to ingest, tool descriptions to rewrite,
308
+ * scaffolding changes to make, memory entries to invalidate. Each
309
+ * finding is one proposed edit with the locus, the diff, and the
310
+ * expected effect.
311
+ *
312
+ * This is the recursive-self-improvement loop's last mile: the prior
313
+ * kinds describe *what's wrong*; this kind describes *what to change*.
314
+ *
315
+ * Recursion is deep (`maxDepth: 3`) because real improvement proposals
316
+ * are competitive: for each failure-mode there are usually 2-3 viable
317
+ * fix directions (tighten prompt vs add tool vs adjust scaffolding),
318
+ * and the actor should explore each with a focused subagent before
319
+ * picking the highest-leverage one to recommend.
320
+ */
321
+
322
+ declare const IMPROVEMENT_KIND_SPEC: TraceAnalystKindSpec;
323
+
324
+ /**
325
+ * Knowledge-gap analyst — what did the agent NOT know that it needed?
326
+ *
327
+ * Brief: find moments in the trace where the agent had to guess, ask
328
+ * the user to fill in context, recover from a wrong assumption, or
329
+ * loop on a retrieval. Each finding names a *missing or outdated piece
330
+ * of knowledge* the agent's curated knowledge base should have held —
331
+ * or a downstream lookup (web, docs, tool description) that surfaced
332
+ * stale or outdated information.
333
+ *
334
+ * The primary expected store is `@tangle-network/agent-knowledge`: a
335
+ * Karpathy-style wiki the agent maintains with raw ↔ curated pages,
336
+ * source anchors, and claim/relation triples. A gap is anything the
337
+ * agent had to discover at run-time that should already have lived
338
+ * there. Secondary loci: web-search results that returned outdated
339
+ * pages, tool descriptions that omitted critical behavior, system-
340
+ * prompt sections that didn't cover the case.
341
+ *
342
+ * Distinct from failure-mode: failure-mode classifies *how* it broke;
343
+ * knowledge-gap names the *information* whose absence (or staleness)
344
+ * caused the break. One failure-mode often maps to several gaps.
345
+ *
346
+ * Recursion (`maxDepth: 2`) is enough to fan out one subagent per
347
+ * candidate gap-source layer; each subagent runs a focused detection.
348
+ */
349
+
350
+ declare const KNOWLEDGE_GAP_KIND_SPEC: TraceAnalystKindSpec;
351
+
352
+ /**
353
+ * Knowledge-poisoning analyst — what FALSE information misled the agent?
354
+ *
355
+ * Brief: find moments where the agent acted on information that was
356
+ * *wrong* — stale memory, RAG documents that contradicted ground truth,
357
+ * tool descriptions that lied about return shapes, system-prompt
358
+ * instructions that no longer matched reality, prior-run summaries that
359
+ * cached a wrong decision.
360
+ *
361
+ * Distinct from knowledge-gap: a gap is "the agent didn't know X"; a
362
+ * poisoning is "the agent confidently used X, but X was wrong." Gaps
363
+ * surface as questions / self-correction; poisonings surface as
364
+ * confident-but-wrong actions that downstream evidence contradicts.
365
+ *
366
+ * Recursion is moderate (`maxDepth: 2`) because each candidate
367
+ * poisoning typically needs two sub-investigations: one to confirm
368
+ * the agent acted on the false belief, one to confirm the belief
369
+ * itself is actually false in ground truth.
370
+ */
371
+
372
+ declare const KNOWLEDGE_POISONING_KIND_SPEC: TraceAnalystKindSpec;
373
+
374
+ /**
375
+ * Default analyst kinds focused on agent failure + recursive
376
+ * self-improvement.
377
+ *
378
+ * The four kinds chain: failure-mode classifies; knowledge-gap and
379
+ * knowledge-poisoning explain *why* in two orthogonal ways; improvement
380
+ * proposes concrete edits. Register all four against the same trace
381
+ * store and the registry runs them in dependency order if the operator
382
+ * pipes findings between them.
383
+ */
384
+
385
+ /**
386
+ * The default kind suite. Order is the run order operators should
387
+ * use: failure-mode first (no upstream deps), gap + poisoning next
388
+ * (both depend on failures), improvement last (chains all three).
389
+ */
390
+ declare const DEFAULT_TRACE_ANALYST_KINDS: readonly TraceAnalystKindSpec[];
391
+
392
+ /**
393
+ * Skill-usage analyst — a DETERMINISTIC `Analyst` over a Claude/Codex skill
394
+ * library + its trace corpus. Unlike the trace-store kinds (failure-mode,
395
+ * improvement, ...) this kind calls no LLM: it mines real usage and skill
396
+ * structure and emits findings by rule.
397
+ *
398
+ * It exists because the naive "Skill-tool invocation count" lies low — it
399
+ * misses orchestrated sub-dispatch (a leaf skill run BY /pursue or /governor
400
+ * logs under the parent), slash-command entry, local-script bypass, and
401
+ * on-disk artifacts. The 2026-05-30 skill audit found 39/53 skills at zero
402
+ * direct invocations, yet only one was a genuine cut: the rest were
403
+ * measurement-invisible or discovery-limited. This analyst encodes that
404
+ * lesson as a multi-signal usage model so a cheap repeatable pass can keep
405
+ * the library honest, and so the expensive audit workflow's verdicts can
406
+ * GEPA-distill it toward agreement (see `gold/skill-verdicts.gold.jsonl`).
407
+ *
408
+ * Report-building (`buildSkillUsageReport`, an fs scan) is separated from
409
+ * finding emission (`SkillUsageAnalyst.analyze`, pure) so the slow scan runs
410
+ * once at the registry boundary and the rule logic stays unit-testable.
411
+ */
412
+
413
+ type SkillKind = 'public' | 'private';
414
+ /** One skill's multi-signal usage + structure. All counts are deterministic. */
415
+ interface SkillUsageRecord {
416
+ name: string;
417
+ kind: SkillKind;
418
+ /** Absolute path to the skill's SKILL.md. */
419
+ path: string;
420
+ lines: number;
421
+ /** `"skill":"<name>"` Skill-tool invocations across the trace corpus. */
422
+ directInvocations: number;
423
+ /** `<command-name>/<name>` slash invocations across the trace corpus. */
424
+ slashInvocations: number;
425
+ /** Sibling skills whose SKILL.md dispatches to this one (`/<name>`). Proxy
426
+ * for orchestrated sub-dispatch the per-skill counter cannot see. */
427
+ inboundRefs: number;
428
+ /** On-disk artifacts attributable to the skill (e.g. `.evolve/<name>/**`). */
429
+ artifactCount: number;
430
+ /** Tangle-private reference count in the body (leak signal for public skills). */
431
+ tanglePrivateRefs: number;
432
+ hasReferencesDir: boolean;
433
+ hasEvalsDir: boolean;
434
+ /** Body mentions `skill-runs.jsonl` (visible to /reflect + /governor). */
435
+ logsRuns: boolean;
436
+ /** Description carries an explicit `Triggers:` clause / trigger phrases. */
437
+ hasTriggerPhrases: boolean;
438
+ }
439
+ interface SkillUsageReport {
440
+ generatedFromTraces: number;
441
+ records: SkillUsageRecord[];
442
+ }
443
+ interface SkillUsageScanConfig {
444
+ /** Dirs holding `*.jsonl` transcripts (Claude `~/.claude/projects`, Codex sessions). */
445
+ transcriptDirs: string[];
446
+ /** Skill roots to scan; each dir directly under `root` with a `SKILL.md` is a skill. */
447
+ skillRoots: {
448
+ root: string;
449
+ kind: SkillKind;
450
+ }[];
451
+ /** Roots scanned for `<root>/.evolve/<skill>` artifact dirs. */
452
+ artifactRoots?: string[];
453
+ /** Token-prefixed mappings: skill name → extra artifact subpaths under an artifactRoot
454
+ * (e.g. reflect → `.evolve/reflections`). Catches non-eponymous artifact dirs. */
455
+ artifactAliases?: Record<string, string[]>;
456
+ /** Cap files read per transcript dir (bounds a huge corpus); 0 = unbounded. */
457
+ maxTranscriptsPerDir?: number;
458
+ }
459
+ /** Scan the corpus + skill roots into a {@link SkillUsageReport}. Deterministic. */
460
+ declare function buildSkillUsageReport(config: SkillUsageScanConfig): SkillUsageReport;
461
+ /** Pure rule pass over a report → findings. Exported for direct/unit use. */
462
+ declare function emitSkillUsageFindings(report: SkillUsageReport, producedAt: string): AnalystFinding[];
463
+ declare class SkillUsageAnalyst implements Analyst<SkillUsageReport> {
464
+ readonly id = "skill-usage";
465
+ readonly description = "Deterministic multi-signal skill-usage analysis: flags dead skills, measurement-invisible (orchestrated) usage, discovery gaps, public-repo leaks, bloat, missing evals, and missing run-logging.";
466
+ readonly inputKind: "custom";
467
+ readonly cost: {
468
+ kind: "deterministic";
469
+ est_usd_per_run: number;
470
+ };
471
+ readonly version = "1.0.0";
472
+ analyze(input: SkillUsageReport, ctx: AnalystContext): Promise<AnalystFinding[]>;
473
+ }
474
+ declare const SKILL_USAGE_ANALYST: SkillUsageAnalyst;
475
+
476
+ /**
477
+ * Deterministic behavioral metrics over OTLP spans — pure arithmetic, no LLM.
478
+ *
479
+ * These are the model-independent multiplier: the four trace-quality signals a
480
+ * tolerant analyzer (e.g. HALO) re-derives per run inside the model — token
481
+ * growth, output decay, tool monoculture, missing self-verification — computed
482
+ * here once, in TypeScript, with zero model judgment. A finding that falls out
483
+ * of arithmetic is trivially model-agnostic and cannot hallucinate the trend.
484
+ *
485
+ * General, not benchmark-specific: the detectors key off token trajectories and
486
+ * tool usage present in any agentic OTLP trace, not any one benchmark.
487
+ */
488
+
489
+ type SuboptimalCode = 'monotonic-input-growth' | 'output-length-decay' | 'single-tool-dependency' | 'no-self-verification';
490
+ interface SuboptimalSignal {
491
+ code: SuboptimalCode;
492
+ severity: 'high' | 'medium' | 'low';
493
+ /** Human-readable claim, with the backing numbers inlined. */
494
+ detail: string;
495
+ /** The exact figures the detector fired on — auditable, no model in the loop. */
496
+ evidence: Record<string, number | string | boolean>;
497
+ }
498
+ interface BehavioralMetrics {
499
+ llmCallCount: number;
500
+ inputTokenTrajectory: number[];
501
+ outputTokenTrajectory: number[];
502
+ toolHistogram: Record<string, number>;
503
+ totalToolCalls: number;
504
+ distinctTools: number;
505
+ /** distinct/total tool calls; 1.0 when there are no tool calls. */
506
+ toolDiversityRatio: number;
507
+ hasSelfVerification: boolean;
508
+ signals: SuboptimalSignal[];
509
+ }
510
+ /**
511
+ * Reduce a span list to behavioral metrics + fired suboptimality signals.
512
+ * Pure + deterministic: same spans → same output, on any machine, no model.
513
+ */
514
+ declare function computeTraceMetrics(spans: readonly TraceAnalystSpan[]): BehavioralMetrics;
515
+
516
+ /**
517
+ * Semantic concept judge — "does the built artifact actually implement
518
+ * the features the user asked for?"
519
+ *
520
+ * Distinct from the domain/code/coherence judges in `judges.ts`:
521
+ * - those judges score free-form conversational agent outputs along
522
+ * quality dimensions (accuracy, depth, etc.)
523
+ * - this judge scores a *built artifact* (served HTML + source files)
524
+ * against an explicit list of expected concepts, returning per-concept
525
+ * {present, score 0-10, evidence, severity}.
526
+ *
527
+ * The judge is strict about distinguishing (a) a working implementation
528
+ * from (b) a keyword-present stub. "// TODO: mint button" is NOT present.
529
+ * Only real, functional, wired-up code counts.
530
+ *
531
+ * Use via {@link createSemanticConceptJudge} or directly via
532
+ * {@link runSemanticConceptJudge}. Soft-fails (available=false) on LLM
533
+ * or JSON-parse errors so the caller can treat that as "layer skipped"
534
+ * rather than "layer failed" in a multi-layer pipeline.
535
+ */
536
+
537
+ /**
538
+ * Implementation complexity class for weighted scoring.
539
+ *
540
+ * - `render` (default): the concept is a UI surface that displays static
541
+ * data — render a list, show a counter, lay out a button. Single-file
542
+ * work, no external integration.
543
+ * - `integrate`: the concept requires wiring a real external system —
544
+ * wallet connect (wagmi + RainbowKit + chain config), payment provider
545
+ * (Stripe Elements + intent + webhook), an API client with auth.
546
+ * Multi-file, library-knowledge, runtime correctness matters.
547
+ * - `compute`: the concept requires algorithmic work — solver, simulator,
548
+ * constraint propagation, ML inference. Correctness > UI polish.
549
+ *
550
+ * Default weights (when applied via `weightConcepts: 'complexity'`):
551
+ * render=1.0, integrate=2.0, compute=2.5
552
+ *
553
+ * Cross-vertical scoring without complexity weighting silently inflates
554
+ * the rate of UI-heavy verticals (healthcare, fintech dashboards) vs
555
+ * integration-heavy verticals (DeFi, wallets) — all concepts treated
556
+ * equally even though the agent does 2-3x the work for `integrate`.
557
+ */
558
+ type ConceptComplexity = 'render' | 'integrate' | 'compute';
559
+ interface ConceptSpec {
560
+ name: string;
561
+ /** Short hints that help the judge; not used for matching. */
562
+ keywords?: string[];
563
+ /** Optional explicit weight; default 1.0. Overrides complexity-derived weight. */
564
+ weight?: number;
565
+ /** Implementation complexity class. Default `render`. */
566
+ complexity?: ConceptComplexity;
567
+ }
568
+ interface ConceptFinding {
569
+ concept: string;
570
+ present: boolean;
571
+ /** 0..10. 10 = production-ready; 7 = functional thin; 4 = partial; 0 = absent. */
572
+ score: number;
573
+ evidence: string;
574
+ severity: Severity;
575
+ }
576
+ interface SemanticConceptJudgeInput {
577
+ /** Full natural-language prompt the agent was handed. */
578
+ userRequest: string;
579
+ /** Rendered HTML the preview returns (UI artifacts). Optional. */
580
+ servedHtml?: string;
581
+ /** Top-level source files from the agent's workdir, each given as a path plus its content. */
582
+ sourceFiles: Array<{
583
+ path: string;
584
+ content: string;
585
+ }>;
586
+ /** The expected concept list (see `ConceptSpec`). */
587
+ expectedConcepts: ConceptSpec[];
588
+ /** Free-form metadata (id, difficulty) to inject into the prompt. */
589
+ artifactLabel?: string;
590
+ artifactDescription?: string; // NOTE(review): undocumented; presumably also prompt metadata like `artifactLabel` — confirm
591
+ }
592
+ interface SemanticConceptJudgeResult {
593
+ kind: 'semantic-concept';
594
+ version: string;
595
+ /** Normalized 0..1 score — mean of per-concept scores / 10 (weighted when `weightConcepts` is `complexity` or `explicit`). */
596
+ score: number;
597
+ presentCount: number;
598
+ totalCount: number;
599
+ findings: ConceptFinding[];
600
+ summary: string;
601
+ durationMs: number;
602
+ costUsd: number | null;
603
+ /** False on LLM/JSON error — treat as "skipped / unable to judge" in pipelines. */
604
+ available: boolean;
605
+ error?: string;
606
+ }
607
+ /**
608
+ * Score-aggregation strategy. `mean` averages the 0-10 scores uniformly.
609
+ * `complexity` applies the complexity weight table (default render=1,
610
+ * integrate=2, compute=2.5) unless a concept has an explicit `weight`.
611
+ * `explicit` honors only `weight` (defaulting to 1 when unspecified).
612
+ */
613
+ type ConceptWeightStrategy = 'mean' | 'complexity' | 'explicit';
614
+ declare const DEFAULT_COMPLEXITY_WEIGHTS: Record<ConceptComplexity, number>; // NOTE(review): presumably the render=1.0 / integrate=2.0 / compute=2.5 table documented on `ConceptComplexity` — confirm
615
+ interface SemanticConceptJudgeOptions {
616
+ /** Model id to call. Default 'claude-sonnet-4-6' via agent-eval defaults. */
617
+ model?: string;
618
+ /** Per-call timeout. Default 180s. */
619
+ timeoutMs?: number;
620
+ /** Budget, in characters, for the source blob included in the prompt; the blob is truncated to fit. Default 45000. */
621
+ maxSourceChars?: number;
622
+ /** Per-file character cap applied before a file is included. Default 20000. */
623
+ maxPerFileChars?: number;
624
+ /** Character cap for the HTML (presumably `servedHtml` — confirm). Default 30000. */
625
+ maxHtmlChars?: number;
626
+ /** LlmClient config (baseUrl, apiKey, authHeader, …). */
627
+ llm?: LlmClientOptions;
628
+ /**
629
+ * Score aggregation strategy. Default `mean` — uniform average across
630
+ * concepts. Cross-vertical comparisons should use `complexity` to
631
+ * neutralize the integrate-vs-render asymmetry.
632
+ */
633
+ weightConcepts?: ConceptWeightStrategy;
634
+ /** Override the default complexity → weight table. */
635
+ complexityWeights?: Partial<Record<ConceptComplexity, number>>;
636
+ }
637
+ declare const SEMANTIC_CONCEPT_JUDGE_VERSION = "semantic-concept-judge-v1-2026-04-24"; // NOTE(review): presumably the value reported in `SemanticConceptJudgeResult.version` — confirm
638
+ /**
639
+ * Run the semantic concept judge on `input`. Soft-fails on LLM/JSON
640
+ * errors: the result has `available: false` — callers in a
641
+ * MultiLayerVerifier pipeline can treat that as "skip" rather than "fail."
642
+ */
643
+ declare function runSemanticConceptJudge(input: SemanticConceptJudgeInput, options?: SemanticConceptJudgeOptions): Promise<SemanticConceptJudgeResult>;
644
+ /**
645
+ * Factory: pin the judge options (including the LLM config) once and return a closure that takes a `SemanticConceptJudgeInput`.
646
+ * Convenient for pipelines that want to share a single LlmClient config.
647
+ */
648
+ declare function createSemanticConceptJudge(options?: SemanticConceptJudgeOptions): (input: SemanticConceptJudgeInput) => Promise<SemanticConceptJudgeResult>;
649
+
650
+ export { type ConceptFinding as A, type BehavioralMetrics as B, type CreateAnalystAiConfig as C, DEFAULT_TRACE_ANALYST_KINDS as D, type ConceptSpec as E, FAILURE_MODE_KIND_SPEC as F, type ConceptWeightStrategy as G, DEFAULT_COMPLEXITY_WEIGHTS as H, IMPROVEMENT_KIND_SPEC as I, SEMANTIC_CONCEPT_JUDGE_VERSION as J, KIND_EXPECTED_SUBJECTS as K, type SemanticConceptJudgeResult as L, type SuboptimalCode as M, type SuboptimalSignal as N, computeTraceMetrics as O, type PersistedFinding as P, createSemanticConceptJudge as Q, runSemanticConceptJudge as R, type SemanticConceptJudgeOptions as S, type SemanticConceptJudgeInput as a, type DefaultAnalystRegistryOptions as b, type DiffPolicy as c, FINDING_SUBJECT_GRAMMAR_PROMPT as d, FINDING_SUBJECT_KINDS as e, type FindingSubject as f, type FindingSubjectKind as g, FindingSubjectStringSchema as h, type FindingsDiff as i, FindingsStore as j, KNOWLEDGE_GAP_KIND_SPEC as k, KNOWLEDGE_POISONING_KIND_SPEC as l, SKILL_USAGE_ANALYST as m, SkillUsageAnalyst as n, type SkillUsageRecord as o, type SkillUsageReport as p, type SkillUsageScanConfig as q, buildDefaultAnalystRegistry as r, buildSkillUsageReport as s, createAnalystAi as t, defaultIsMaterial as u, diffFindings as v, emitSkillUsageFindings as w, parseFindingSubject as x, renderFindingSubject as y, type ConceptComplexity as z };
@@ -245,4 +245,4 @@ interface TraceAnalysisStore {
245
245
  }): Promise<SearchSpanResult>;
246
246
  }
247
247
 
248
- export { DEFAULT_TRACE_ANALYST_BUDGETS as D, type QueryTracesPage as Q, type SearchSpanResult as S, type TraceAnalysisStore as T, type ViewSpansResult as V, type DatasetOverview as a, type SearchTraceResult as b, type SpanMatchRecord as c, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX as d, type TraceAnalystByteBudgets as e, type TraceAnalystFilters as f, type TraceAnalystSpan as g, type TraceAnalystSpanKind as h, type TraceAnalystSpanStatus as i, type TraceAnalystTraceSummary as j, type ViewTraceOversized as k, type ViewTraceResult as l };
248
+ export { DEFAULT_TRACE_ANALYST_BUDGETS as D, type QueryTracesPage as Q, type SearchSpanResult as S, type TraceAnalysisStore as T, type ViewSpansResult as V, type TraceAnalystSpan as a, type DatasetOverview as b, type SearchTraceResult as c, type SpanMatchRecord as d, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX as e, type TraceAnalystByteBudgets as f, type TraceAnalystFilters as g, type TraceAnalystSpanKind as h, type TraceAnalystSpanStatus as i, type TraceAnalystTraceSummary as j, type ViewTraceOversized as k, type ViewTraceResult as l };