npm - @tangle-network/agent-eval - Versions diffs - 0.71.0 → 0.72.3 - Mend

@tangle-network/agent-eval 0.71.0 → 0.72.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (73) hide show

package/CHANGELOG.md +63 -0
package/dist/adapters/http.d.ts +1 -1
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/otel.d.ts +3 -2
package/dist/agent-profile-DYRboYWu.d.ts +364 -0
package/dist/analyst/index.d.ts +221 -0
package/dist/analyst/index.js +371 -0
package/dist/analyst/index.js.map +1 -0
package/dist/analyst-t7zZS3TV.d.ts +88 -0
package/dist/campaign/index.d.ts +485 -9
package/dist/campaign/index.js +618 -30
package/dist/campaign/index.js.map +1 -1
package/dist/chunk-7W4SM7FD.js +1075 -0
package/dist/chunk-7W4SM7FD.js.map +1 -0
package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
package/dist/chunk-JHA3ZGSO.js +1496 -0
package/dist/chunk-JHA3ZGSO.js.map +1 -0
package/dist/{chunk-VMAYE3LM.js → chunk-JYE3WOTE.js} +57 -9
package/dist/{chunk-VMAYE3LM.js.map → chunk-JYE3WOTE.js.map} +1 -1
package/dist/chunk-LB2UOI5F.js +412 -0
package/dist/chunk-LB2UOI5F.js.map +1 -0
package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
package/dist/chunk-VUINJM5M.js.map +1 -0
package/dist/chunk-WYIHD6EB.js +1044 -0
package/dist/chunk-WYIHD6EB.js.map +1 -0
package/dist/{chunk-6QZUCFKM.js → chunk-XPILG2CA.js} +120 -3
package/dist/chunk-XPILG2CA.js.map +1 -0
package/dist/{chunk-6XQIEUQ2.js → chunk-ZPSKPT3V.js} +5 -3
package/dist/{chunk-6XQIEUQ2.js.map → chunk-ZPSKPT3V.js.map} +1 -1
package/dist/contract/index.d.ts +17 -13
package/dist/contract/index.js +14 -8
package/dist/contract/index.js.map +1 -1
package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
package/dist/hosted/index.d.ts +223 -2
package/dist/index.d.ts +49 -1323
package/dist/index.js +339 -2627
package/dist/index.js.map +1 -1
package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
package/dist/openapi.json +1 -1
package/dist/pareto-E-pembql.d.ts +81 -0
package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
package/dist/redact-B40YG2M_.d.ts +45 -0
package/dist/registry-DuVYiTvw.d.ts +128 -0
package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
package/dist/rl.d.ts +4 -3
package/dist/rl.js +4 -4
package/dist/{run-campaign-BVY3RGAZ.js → run-campaign-OVEZF24D.js} +2 -2
package/dist/run-critic-BAIjX99r.d.ts +56 -0
package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
package/dist/traces.d.ts +371 -308
package/dist/traces.js +43 -18
package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
package/dist/wire/index.d.ts +1 -1
package/dist/workflow/index.d.ts +494 -0
package/dist/workflow/index.js +2177 -0
package/dist/workflow/index.js.map +1 -0
package/docs/design/self-improvement-roadmap.md +106 -0
package/package.json +36 -12
package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
package/dist/chunk-6QZUCFKM.js.map +0 -1
package/dist/chunk-ODGETRTM.js.map +0 -1
package/dist/chunk-PQV2TKC3.js +0 -27
package/dist/chunk-PQV2TKC3.js.map +0 -1
/package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0
/package/dist/{run-campaign-BVY3RGAZ.js.map → run-campaign-OVEZF24D.js.map} +0 -0

package/dist/semantic-concept-judge-CV9Wlx4t.d.ts ADDED Viewed

@@ -0,0 +1,650 @@
+import { AxAIService } from '@ax-llm/ax';
+import { c as TraceAnalystKindSpec } from './kind-factory-DW9XWPvM.js';
+import { b as AnalystRegistryOptions, a as AnalystRegistry } from './registry-DuVYiTvw.js';
+import { z } from 'zod';
+import { c as AnalystFinding, A as Analyst, a as AnalystContext } from './types-CRD68aH7.js';
+import { a as TraceAnalystSpan } from './store-GmBE2pZZ.js';
+import { L as LlmClientOptions } from './llm-client-DbjLfz-K.js';
+import { S as Severity } from './multi-layer-verifier-DlWCXuxL.js';
+interface CreateAnalystAiConfig {
+    /** OpenAI-compatible API key forwarded as `Authorization: Bearer`.
+     *  cli-bridge ignores the value on loopback but Ax requires a non-empty string. */
+    apiKey: string;
+    /** OpenAI-compatible base URL — e.g. `https://router.tangle.tools/v1` or a
+     *  cli-bridge loopback. */
+    baseUrl: string;
+    /** Model id forwarded to the analyst actor + responder. */
+    model: string;
+    /** Ax provider name. Defaults to the OpenAI-compatible client. */
+    provider?: 'openai' | 'anthropic';
+}
+/**
+ * Construct the `AxAIService` an analyst kind calls through
+ * (`createTraceAnalystKind({ ai })`).
+ *
+ * Ax's `ai()` pins `config.model` to the OpenAI catalog enum, but every
+ * OpenAI-compatible router an analyst points at (router.tangle.tools,
+ * cli-bridge) accepts arbitrary model ids (claude-code/sonnet, openai/gpt-5.4,
+ * …). Consumers were each re-rolling `ai({ name, apiKey, apiURL, config })`
+ * behind an `as (a: any) => any` cast to dodge the enum; this is the one
+ * canonical constructor so they don't have to — and don't take a direct
+ * `@ax-llm/ax` dependency for it.
+ */
+declare function createAnalystAi(config: CreateAnalystAiConfig): AxAIService;
+/**
+ * `buildDefaultAnalystRegistry` — the canonical analyst suite, so consumers
+ * stop hand-wiring `new AnalystRegistry()` + per-kind `createTraceAnalystKind`.
+ *
+ * The deterministic `behavioralAnalyst` is ALWAYS registered (it needs no
+ * model and is model-agnostic by construction). The agentic RLM kinds are
+ * registered only when an `ai` service is supplied — so a caller with no LLM
+ * still gets the full behavioral/efficiency diagnosis, and the substrate's
+ * "any model (including no model)" guarantee holds at the suite level.
+ */
+interface DefaultAnalystRegistryOptions {
+    /** Ax service for the agentic RLM kinds. Omit → only the deterministic analyst. */
+    ai?: AxAIService;
+    /** Model for the agentic kinds (falls back to the ai service default). */
+    model?: string;
+    /** Which agentic kinds to register when `ai` is present. Default = the shipped suite. */
+    kinds?: readonly TraceAnalystKindSpec[];
+    /** Set false to omit the deterministic behavioral analyst (default: include). */
+    includeBehavioral?: boolean;
+    /** Forwarded to the AnalystRegistry constructor (signal, tags, priorFindings). */
+    registry?: AnalystRegistryOptions;
+}
+declare function buildDefaultAnalystRegistry(opts?: DefaultAnalystRegistryOptions): AnalystRegistry;
+/**
+ * Typed `FindingSubject` — the canonical grammar every analyst kind emits.
+ *
+ * Background: kind actor prompts have always documented a subject grammar
+ * (e.g. `system-prompt:<section>`, `agent-knowledge:wiki:<slug>`) but the
+ * LLM was unconstrained — it could emit `subject: "fix the prompt"`
+ * (prose) and downstream adapters routed on `startsWith(...)` would
+ * silently skip it. Every per-vertical `ImprovementAdapter` had a
+ * routing table that mostly caught nothing.
+ *
+ * This module fixes that:
+ *   - `parseFindingSubject(raw)` — returns the typed `FindingSubject`
+ *     when `raw` matches the grammar, else `null`. Used at the
+ *     `RawAnalystFindingSchema` boundary so malformed subjects are
+ *     rejected loudly instead of silently lifted into the registry.
+ *   - `FindingSubjectKind` — the union of valid locus categories. Each
+ *     variant carries the typed components downstream adapters resolve
+ *     against the agent's surface manifest (no string parsing in the
+ *     adapter).
+ *   - `FINDING_SUBJECT_GRAMMAR_PROMPT` — single source of truth for the
+ *     grammar string embedded in kind actor prompts. Drift between
+ *     prompt and parser is impossible if every kind imports this.
+ *
+ * The grammar is intentionally NARROW — only loci the substrate's
+ * default `ImprovementAdapter` / `KnowledgeAdapter` can act on. A
+ * finding with a subject outside this set fails the parser; the kind
+ * author either extends the grammar here (and adds adapter routing)
+ * or rephrases the prompt to map onto an existing variant.
+ *
+ * `failure-mode` is the one exception — its subjects are free-form
+ * cluster labels, not loci. The schema preserves them as
+ * `{ kind: 'cluster', label }` and the adapters skip them (cluster
+ * findings are evidence, not actionable mutations).
+ */
+/**
+ * Discriminated union of every locus the substrate can route findings to.
+ *
+ * Adapters narrow on `kind` and use the typed components (no string
+ * parsing). Adding a variant here REQUIRES updating the parser, the
+ * grammar prompt, and at least one adapter — by design.
+ */
+type FindingSubject = {
+    kind: 'knowledge.wiki';
+    slug: string;
+    heading?: string;
+} | {
+    kind: 'knowledge.claim';
+    topic: string;
+} | {
+    kind: 'knowledge.raw';
+    sourceId: string;
+} | {
+    kind: 'knowledge.stale';
+    slug: string;
+} | {
+    kind: 'system-prompt';
+    section: string;
+} | {
+    kind: 'tool-doc';
+    tool: string;
+    aspect?: string;
+} | {
+    kind: 'new-tool';
+    name: string;
+} | {
+    kind: 'rag';
+    corpus: string;
+    docId: string;
+} | {
+    kind: 'memory';
+    key: string;
+} | {
+    kind: 'scaffolding';
+    concern: string;
+} | {
+    kind: 'output-schema';
+    field: string;
+} | {
+    kind: 'websearch.outdated';
+    topic: string;
+} | {
+    kind: 'prior-run-summary';
+    topic: string;
+} | {
+    kind: 'cluster';
+    label: string;
+};
+type FindingSubjectKind = FindingSubject['kind'];
+declare const FINDING_SUBJECT_KINDS: ReadonlyArray<FindingSubjectKind>;
+/**
+ * Parse a raw subject string emitted by an analyst kind's actor.
+ *
+ * Returns the typed `FindingSubject` when `raw` matches the grammar,
+ * else `null`. Callers use the `null` return as a signal to either
+ * (a) reject the finding at parse time (kinds that emit typed loci —
+ * knowledge-gap, improvement, knowledge-poisoning) or (b) lift it as
+ * a cluster label (failure-mode).
+ *
+ * Slugs are constrained to `[a-z0-9-]+` (lowercase kebab) to keep file
+ * paths sane downstream. Topics / keys / sections allow any non-empty
+ * string (free-form for the LLM's voice) but get trimmed.
+ *
+ * Empty / whitespace-only inputs return `null`. `null` and `undefined`
+ * inputs also return `null`. All of these are surfaced by the caller as a
+ * rejected subject.
+ */
+declare function parseFindingSubject(raw: string | null | undefined): FindingSubject | null;
+/**
+ * Render the parsed subject back to its canonical string form. Inverse
+ * of `parseFindingSubject`; useful when the substrate constructs new
+ * findings programmatically (e.g. for tests, replays, or
+ * `id_basis` carry-forward).
+ */
+declare function renderFindingSubject(s: FindingSubject): string;
+/**
+ * The grammar text embedded into kind actor prompts. Kinds opt into
+ * the subset of variants they emit (e.g. `improvement` excludes the
+ * cluster variant; `failure-mode` includes ONLY the cluster variant).
+ *
+ * Drift between prompt and parser is impossible: every kind imports
+ * this constant + the matching `expects` set, and the package's unit
+ * tests lock the table to the parser.
+ */
+declare const FINDING_SUBJECT_GRAMMAR_PROMPT: string;
+/**
+ * The variants each kind is allowed to emit. Used at the kind factory
+ * boundary so a knowledge-gap finding can't sneak in a `system-prompt:*`
+ * subject (the improvement-analyst's job) and vice versa.
+ *
+ * `failure-mode` is restricted to `cluster` — the only kind that emits
+ * a non-locus subject.
+ */
+declare const KIND_EXPECTED_SUBJECTS: Record<string, ReadonlyArray<FindingSubjectKind>>;
+/**
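+ * Zod schema that validates a raw subject string against the
+ * `FindingSubject` grammar. Embedded in `RawAnalystFindingSchema` via
+ * `transform`, so `subject` arrives at the kind factory either as a
+ * typed locus or as a parse error attached to a single Zod issue.
+ *
+ * NOTE(review): the declared type is `z.ZodString`, so this schema's own
+ * output is the validated string; the conversion to a typed
+ * `FindingSubject` presumably happens in the embedding `transform` —
+ * confirm against the implementation.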
+ *
+ * Optionality is preserved: subjects ARE optional on the wire (some
+ * findings are descriptive, not actionable). When present, they MUST
+ * parse — emitting a malformed subject is a contract violation, not a
+ * soft signal.
+ */
+declare const FindingSubjectStringSchema: z.ZodString;
+/**
+ * FindingsStore — durable persistence for AnalystFinding rows + a diff
+ * helper so we can answer "what changed since the last run?" without
+ * recomputing analysts.
+ *
+ * On-disk shape is JSONL: one finding per line, append-only, locked via
+ * LockedJsonlAppender. Operators get crash-safety (no partial JSON),
+ * cheap reads (sequential parse), and trivial backup (rsync the file).
+ *
+ * Reads are non-locking: a reader sees a consistent snapshot of all
+ * fully-written lines and skips an incomplete trailing line if the
+ * writer is mid-append. Cross-process locking is intentionally out of
+ * scope (see locked-jsonl-appender.ts).
+ *
+ * The store is run-scoped: callers pass `runId` on append and on load,
+ * which keeps multi-run files cleanly partitioned. The `diffFindings`
+ * helper compares two run-id sets using stable `finding_id` semantics —
+ * the diff is the cross-run signal the regression dashboard renders.
+ */
+/**
+ * One persisted row. We attach `run_id` on disk so a single file can
+ * hold multiple runs and the diff helper can query without re-walking
+ * separate files.
+ */
+interface PersistedFinding extends AnalystFinding {
+    run_id: string;
+}
+declare class FindingsStore {
+    readonly path: string;
+    private readonly appender;
+    constructor(path: string);
+    append(runId: string, findings: AnalystFinding[]): Promise<void>;
+    /** Load every persisted finding. Discards malformed trailing lines silently. */
+    loadAll(): PersistedFinding[];
+    /** Filter to a single run. */
+    loadRun(runId: string): PersistedFinding[];
+}
+interface FindingsDiff {
+    /** New finding ids in `current` that weren't in `previous`. */
+    appeared: PersistedFinding[];
+    /** Finding ids in `previous` that aren't in `current`. */
+    disappeared: PersistedFinding[];
+    /** Same finding id present in both runs and unchanged per the materiality test. */
+    persisted: PersistedFinding[];
+    /**
+     * Same finding id in both runs but at least one non-identity field
+     * shifted per `DiffPolicy.isMaterial`. Reported as
+     * `{ previous, current }` pairs.
+     */
+    changed: Array<{
+        previous: PersistedFinding;
+        current: PersistedFinding;
+    }>;
+}
+interface DiffPolicy {
+    /**
+     * Predicate that decides whether two findings (same finding_id) count
+     * as a material change. Defaults to {@link defaultIsMaterial}: severity
+     * shift, confidence Δ > 0.05, or evidence count change. Compliance /
+     * perf consumers MAY supply a stricter predicate (e.g. rationale text
+     * diff, metric Δ thresholds).
+     */
+    isMaterial?: (previous: AnalystFinding, current: AnalystFinding) => boolean;
+}
+/**
+ * Default materiality test. Deliberately narrow so LLM-reword churn
+ * doesn't flood the diff. Stricter tests are opt-in via DiffPolicy.
+ */
+declare function defaultIsMaterial(a: AnalystFinding, b: AnalystFinding): boolean;
+/**
+ * Diff two findings sets by stable finding_id. Callers typically load
+ * the two run-id slices from the same store and pass them in.
+ */
+declare function diffFindings(previous: PersistedFinding[], current: PersistedFinding[], policy?: DiffPolicy): FindingsDiff;
+/**
+ * Failure-mode analyst — classifies what went wrong and why.
+ *
+ * Brief: read the trace dataset, identify the top failure modes across
+ * runs, classify each with severity + evidence, and surface them as
+ * findings. The actor's job is *taxonomy + evidence*, not fix-design —
+ * that's the improvement-analyst's job.
+ *
+ * Recursion is deep (`maxDepth: 3`) because real failure-mode
+ * discovery is genuinely tree-shaped: the actor splits the dataset
+ * into candidate clusters, each cluster spawns a focused investigator
+ * that drills into representative traces, and a deeply-recursed
+ * investigator may itself split a confounded mode into two sub-modes.
+ * Each level fans out 4-way, so the analyst can investigate up to
+ * ~16 leaf clusters before hitting the depth ceiling.
+ */
+declare const FAILURE_MODE_KIND_SPEC: TraceAnalystKindSpec;
+/**
+ * Improvement analyst — actionable, recursive self-improvement findings.
+ *
+ * Brief: read findings from upstream analysts (failure-mode,
+ * knowledge-gap, knowledge-poisoning) AND the trace dataset itself,
+ * then propose **concrete edits** to the agent's runtime: prompt
+ * additions, RAG documents to ingest, tool descriptions to rewrite,
+ * scaffolding changes to make, memory entries to invalidate. Each
+ * finding is one proposed edit with the locus, the diff, and the
+ * expected effect.
+ *
+ * This is the recursive-self-improvement loop's last mile: the prior
+ * kinds describe *what's wrong*; this kind describes *what to change*.
+ *
+ * Recursion is deep (`maxDepth: 3`) because real improvement proposals
+ * are competitive: for each failure-mode there are usually 2-3 viable
+ * fix directions (tighten prompt vs add tool vs adjust scaffolding),
+ * and the actor should explore each with a focused subagent before
+ * picking the highest-leverage one to recommend.
+ */
+declare const IMPROVEMENT_KIND_SPEC: TraceAnalystKindSpec;
+/**
+ * Knowledge-gap analyst — what did the agent NOT know that it needed?
+ *
+ * Brief: find moments in the trace where the agent had to guess, ask
+ * the user to fill in context, recover from a wrong assumption, or
+ * loop on a retrieval. Each finding names a *missing or outdated piece
+ * of knowledge* the agent's curated knowledge base should have held —
+ * or a downstream lookup (web, docs, tool description) that surfaced
+ * stale or outdated information.
+ *
+ * The primary expected store is `@tangle-network/agent-knowledge`: a
+ * Karpathy-style wiki the agent maintains with raw ↔ curated pages,
+ * source anchors, and claim/relation triples. A gap is anything the
+ * agent had to discover at run-time that should already have lived
+ * there. Secondary loci: web-search results that returned outdated
+ * pages, tool descriptions that omitted critical behavior, system-
+ * prompt sections that didn't cover the case.
+ *
+ * Distinct from failure-mode: failure-mode classifies *how* it broke;
+ * knowledge-gap names the *information* whose absence (or staleness)
+ * caused the break. One failure-mode often maps to several gaps.
+ *
+ * Recursion (`maxDepth: 2`) is enough to fan out one subagent per
+ * candidate gap-source layer; each subagent runs a focused detection.
+ */
+declare const KNOWLEDGE_GAP_KIND_SPEC: TraceAnalystKindSpec;
+/**
+ * Knowledge-poisoning analyst — what FALSE information misled the agent?
+ *
+ * Brief: find moments where the agent acted on information that was
+ * *wrong* — stale memory, RAG documents that contradicted ground truth,
+ * tool descriptions that lied about return shapes, system-prompt
+ * instructions that no longer matched reality, prior-run summaries that
+ * cached a wrong decision.
+ *
+ * Distinct from knowledge-gap: a gap is "the agent didn't know X"; a
+ * poisoning is "the agent confidently used X, but X was wrong." Gaps
+ * surface as questions / self-correction; poisonings surface as
+ * confident-but-wrong actions that downstream evidence contradicts.
+ *
+ * Recursion is moderate (`maxDepth: 2`) because each candidate
+ * poisoning typically needs two sub-investigations: one to confirm
+ * the agent acted on the false belief, one to confirm the belief
+ * itself is actually false in ground truth.
+ */
+declare const KNOWLEDGE_POISONING_KIND_SPEC: TraceAnalystKindSpec;
+/**
+ * Default analyst kinds focused on agent failure + recursive
+ * self-improvement.
+ *
+ * The four kinds chain: failure-mode classifies; knowledge-gap and
+ * knowledge-poisoning explain *why* in two orthogonal ways; improvement
+ * proposes concrete edits. Register all four against the same trace
+ * store and the registry runs them in dependency order if the operator
+ * pipes findings between them.
+ */
+/**
+ * The default kind suite. Order is the run order operators should
+ * use: failure-mode first (no upstream deps), gap + poisoning next
+ * (both depend on failures), improvement last (chains all three).
+ */
+declare const DEFAULT_TRACE_ANALYST_KINDS: readonly TraceAnalystKindSpec[];
+/**
+ * Skill-usage analyst — a DETERMINISTIC `Analyst` over a Claude/Codex skill
+ * library + its trace corpus. Unlike the trace-store kinds (failure-mode,
+ * improvement, ...) this kind calls no LLM: it mines real usage and skill
+ * structure and emits findings by rule.
+ *
+ * It exists because the naive "Skill-tool invocation count" lies low — it
+ * misses orchestrated sub-dispatch (a leaf skill run BY /pursue or /governor
+ * logs under the parent), slash-command entry, local-script bypass, and
+ * on-disk artifacts. The 2026-05-30 skill audit found 39/53 skills at zero
+ * direct invocations, yet only one was a genuine cut: the rest were
+ * measurement-invisible or discovery-limited. This analyst encodes that
+ * lesson as a multi-signal usage model so a cheap repeatable pass can keep
+ * the library honest, and so the expensive audit workflow's verdicts can
+ * GEPA-distill it toward agreement (see `gold/skill-verdicts.gold.jsonl`).
+ *
+ * Report-building (`buildSkillUsageReport`, an fs scan) is separated from
+ * finding emission (`SkillUsageAnalyst.analyze`, pure) so the slow scan runs
+ * once at the registry boundary and the rule logic stays unit-testable.
+ */
+type SkillKind = 'public' | 'private';
+/** One skill's multi-signal usage + structure. All counts are deterministic. */
+interface SkillUsageRecord {
+    name: string;
+    kind: SkillKind;
+    /** Absolute path to the skill's SKILL.md. */
+    path: string;
+    lines: number;
+    /** `"skill":"<name>"` Skill-tool invocations across the trace corpus. */
+    directInvocations: number;
+    /** `<command-name>/<name>` slash invocations across the trace corpus. */
+    slashInvocations: number;
+    /** Sibling skills whose SKILL.md dispatches to this one (`/<name>`). Proxy
+     *  for orchestrated sub-dispatch the per-skill counter cannot see. */
+    inboundRefs: number;
+    /** On-disk artifacts attributable to the skill (e.g. `.evolve/<name>/**`). */
+    artifactCount: number;
+    /** Tangle-private reference count in the body (leak signal for public skills). */
+    tanglePrivateRefs: number;
+    hasReferencesDir: boolean;
+    hasEvalsDir: boolean;
+    /** Body mentions `skill-runs.jsonl` (visible to /reflect + /governor). */
+    logsRuns: boolean;
+    /** Description carries an explicit `Triggers:` clause / trigger phrases. */
+    hasTriggerPhrases: boolean;
+}
+interface SkillUsageReport {
+    generatedFromTraces: number;
+    records: SkillUsageRecord[];
+}
+interface SkillUsageScanConfig {
+    /** Dirs holding `*.jsonl` transcripts (Claude `~/.claude/projects`, Codex sessions). */
+    transcriptDirs: string[];
+    /** Skill roots to scan; each dir directly under `root` with a `SKILL.md` is a skill. */
+    skillRoots: {
+        root: string;
+        kind: SkillKind;
+    }[];
+    /** Roots scanned for `<root>/.evolve/<skill>` artifact dirs. */
+    artifactRoots?: string[];
+    /** Token-prefixed mappings: skill name → extra artifact subpaths under an artifactRoot
+     *  (e.g. reflect → `.evolve/reflections`). Catches non-eponymous artifact dirs. */
+    artifactAliases?: Record<string, string[]>;
+    /** Cap files read per transcript dir (bounds a huge corpus); 0 = unbounded. */
+    maxTranscriptsPerDir?: number;
+}
+/** Scan the corpus + skill roots into a {@link SkillUsageReport}. Deterministic. */
+declare function buildSkillUsageReport(config: SkillUsageScanConfig): SkillUsageReport;
+/** Pure rule pass over a report → findings. Exported for direct/unit use. */
+declare function emitSkillUsageFindings(report: SkillUsageReport, producedAt: string): AnalystFinding[];
+declare class SkillUsageAnalyst implements Analyst<SkillUsageReport> {
+    readonly id = "skill-usage";
+    readonly description = "Deterministic multi-signal skill-usage analysis: flags dead skills, measurement-invisible (orchestrated) usage, discovery gaps, public-repo leaks, bloat, missing evals, and missing run-logging.";
+    readonly inputKind: "custom";
+    readonly cost: {
+        kind: "deterministic";
+        est_usd_per_run: number;
+    };
+    readonly version = "1.0.0";
+    analyze(input: SkillUsageReport, ctx: AnalystContext): Promise<AnalystFinding[]>;
+}
+declare const SKILL_USAGE_ANALYST: SkillUsageAnalyst;
+/**
+ * Deterministic behavioral metrics over OTLP spans — pure arithmetic, no LLM.
+ *
+ * These are the model-independent multiplier: the four trace-quality signals a
+ * tolerant analyzer (e.g. HALO) re-derives per run inside the model — token
+ * growth, output decay, tool monoculture, missing self-verification — computed
+ * here once, in TypeScript, with zero model judgment. A finding that falls out
+ * of arithmetic is trivially model-agnostic and cannot hallucinate the trend.
+ *
+ * General, not trace-specific: the detectors key off token trajectories and
+ * tool usage present in any agentic OTLP trace, not any one benchmark.
+ */
+type SuboptimalCode = 'monotonic-input-growth' | 'output-length-decay' | 'single-tool-dependency' | 'no-self-verification';
+interface SuboptimalSignal {
+    code: SuboptimalCode;
+    severity: 'high' | 'medium' | 'low';
+    /** Human-readable claim, with the backing numbers inlined. */
+    detail: string;
+    /** The exact figures the detector fired on — auditable, no model in the loop. */
+    evidence: Record<string, number | string | boolean>;
+}
+interface BehavioralMetrics {
+    llmCallCount: number;
+    inputTokenTrajectory: number[];
+    outputTokenTrajectory: number[];
+    toolHistogram: Record<string, number>;
+    totalToolCalls: number;
+    distinctTools: number;
+    /** distinct/total tool calls; 1.0 when there are no tool calls. */
+    toolDiversityRatio: number;
+    hasSelfVerification: boolean;
+    signals: SuboptimalSignal[];
+}
+/**
+ * Reduce a span list to behavioral metrics + fired suboptimality signals.
+ * Pure + deterministic: same spans → same output, on any machine, no model.
+ */
+declare function computeTraceMetrics(spans: readonly TraceAnalystSpan[]): BehavioralMetrics;
+/**
+ * Semantic concept judge — "does the built artifact actually implement
+ * the features the user asked for?"
+ *
+ * Distinct from the domain/code/coherence judges in `judges.ts`:
+ *   - those judges score free-form conversational agent outputs along
+ *     quality dimensions (accuracy, depth, etc.)
+ *   - this judge scores a *built artifact* (served HTML + source files)
+ *     against an explicit list of expected concepts, returning per-concept
+ *     {present, score 0-10, evidence, severity}.
+ *
+ * The judge is strict about distinguishing (a) a working implementation
+ * from (b) a keyword-present stub. "// TODO: mint button" is NOT present.
+ * Only real, functional, wired-up code counts.
+ *
+ * Use via {@link createSemanticConceptJudge} or directly via
+ * {@link runSemanticConceptJudge}. Soft-fails (available=false) on LLM
+ * or JSON-parse errors so the caller can treat that as "layer skipped"
+ * rather than "layer failed" in a multi-layer pipeline.
+ */
+/**
+ * Implementation complexity class for weighted scoring.
+ *
+ * - `render` (default): the concept is a UI surface that displays static
+ *   data — render a list, show a counter, lay out a button. Single-file
+ *   work, no external integration.
+ * - `integrate`: the concept requires wiring a real external system —
+ *   wallet connect (wagmi + RainbowKit + chain config), payment provider
+ *   (Stripe Elements + intent + webhook), an API client with auth.
+ *   Multi-file, library-knowledge, runtime correctness matters.
+ * - `compute`: the concept requires algorithmic work — solver, simulator,
+ *   constraint propagation, ML inference. Correctness > UI polish.
+ *
+ * Default weights (when applied via `weightConcepts: 'complexity'`):
+ *   render=1.0, integrate=2.0, compute=2.5
+ *
+ * Cross-vertical scoring without complexity weighting silently inflates
+ * the rate of UI-heavy verticals (healthcare, fintech dashboards) vs
+ * integration-heavy verticals (DeFi, wallets) — all concepts treated
+ * equally even though the agent does 2-3x the work for `integrate`.
+ */
+type ConceptComplexity = 'render' | 'integrate' | 'compute';
+interface ConceptSpec {
+    name: string;
+    /** Short hints that help the judge; not used for matching. */
+    keywords?: string[];
+    /** Optional explicit weight; default 1.0. Overrides complexity-derived weight. */
+    weight?: number;
+    /** Implementation complexity class. Default `render`. */
+    complexity?: ConceptComplexity;
+}
+interface ConceptFinding {
+    concept: string;
+    present: boolean;
+    /** 0..10. 10 = production-ready; 7 = functional but thin; 4 = partial; 0 = absent. */
+    score: number;
+    evidence: string;
+    severity: Severity;
+}
+interface SemanticConceptJudgeInput {
+    /** Full natural-language prompt the agent was handed. */
+    userRequest: string;
+    /** Rendered HTML the preview returns (UI artifacts). Optional. */
+    servedHtml?: string;
+    /** Top-level source files from the agent's workdir. */
+    sourceFiles: Array<{
+        path: string;
+        content: string;
+    }>;
+    /** The expected concept list. */
+    expectedConcepts: ConceptSpec[];
+    /** Free-form metadata (id, difficulty) to inject into the prompt. */
+    artifactLabel?: string;
+    artifactDescription?: string;
+}
+interface SemanticConceptJudgeResult {
+    kind: 'semantic-concept';
+    version: string;
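+    /** Normalized 0..1 score — mean of per-concept scores / 10.
+     *  NOTE(review): presumably a weighted mean when
+     *  `SemanticConceptJudgeOptions.weightConcepts` is `complexity` or
+     *  `explicit` — confirm against the implementation. */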
+    score: number;
+    presentCount: number;
+    totalCount: number;
+    findings: ConceptFinding[];
+    summary: string;
+    durationMs: number;
+    costUsd: number | null;
+    /** False on LLM/JSON error — treat as "skipped / unable to judge" in pipelines. */
+    available: boolean;
+    error?: string;
+}
+/**
+ * Score-aggregation strategy. `mean` averages 0-10 scores uniformly.
+ * `complexity` applies the default weight table (render=1, integrate=2,
+ * compute=2.5) unless a concept has an explicit `weight`. `explicit`
+ * honors only `weight` (defaulting to 1 for unspecified).
+ */
+type ConceptWeightStrategy = 'mean' | 'complexity' | 'explicit';
+declare const DEFAULT_COMPLEXITY_WEIGHTS: Record<ConceptComplexity, number>;
+interface SemanticConceptJudgeOptions {
+    /** Model id to call. Default 'claude-sonnet-4-6' via agent-eval defaults. */
+    model?: string;
+    /** Per-call timeout. Default 180s. */
+    timeoutMs?: number;
+    /** Pipeline budget for the prompt (source blob truncation). Default 45000. */
+    maxSourceChars?: number;
+    /** Per-file cap before inclusion. Default 20000. */
+    maxPerFileChars?: number;
+    /** HTML cap. Default 30000. */
+    maxHtmlChars?: number;
+    /** LlmClient config (baseUrl, apiKey, authHeader, …). */
+    llm?: LlmClientOptions;
+    /**
+     * Score aggregation strategy. Default `mean` — uniform average across
+     * concepts. Cross-vertical comparisons should use `complexity` to
+     * neutralize the integrate-vs-render asymmetry.
+     */
+    weightConcepts?: ConceptWeightStrategy;
+    /** Override the default complexity → weight table. */
+    complexityWeights?: Partial<Record<ConceptComplexity, number>>;
+}
+declare const SEMANTIC_CONCEPT_JUDGE_VERSION = "semantic-concept-judge-v1-2026-04-24";
+/**
+ * Run the semantic concept judge. Soft-fails to available=false on
+ * LLM/JSON errors — callers in a MultiLayerVerifier pipeline can treat
+ * that as "skip" rather than "fail."
+ */
+declare function runSemanticConceptJudge(input: SemanticConceptJudgeInput, options?: SemanticConceptJudgeOptions): Promise<SemanticConceptJudgeResult>;
+/**
+ * Factory: pin LLM options once, return a closure that accepts inputs.
+ * Convenient for pipelines that want to share a single LlmClient config.
+ */
+declare function createSemanticConceptJudge(options?: SemanticConceptJudgeOptions): (input: SemanticConceptJudgeInput) => Promise<SemanticConceptJudgeResult>;
+export { type ConceptFinding as A, type BehavioralMetrics as B, type CreateAnalystAiConfig as C, DEFAULT_TRACE_ANALYST_KINDS as D, type ConceptSpec as E, FAILURE_MODE_KIND_SPEC as F, type ConceptWeightStrategy as G, DEFAULT_COMPLEXITY_WEIGHTS as H, IMPROVEMENT_KIND_SPEC as I, SEMANTIC_CONCEPT_JUDGE_VERSION as J, KIND_EXPECTED_SUBJECTS as K, type SemanticConceptJudgeResult as L, type SuboptimalCode as M, type SuboptimalSignal as N, computeTraceMetrics as O, type PersistedFinding as P, createSemanticConceptJudge as Q, runSemanticConceptJudge as R, type SemanticConceptJudgeOptions as S, type SemanticConceptJudgeInput as a, type DefaultAnalystRegistryOptions as b, type DiffPolicy as c, FINDING_SUBJECT_GRAMMAR_PROMPT as d, FINDING_SUBJECT_KINDS as e, type FindingSubject as f, type FindingSubjectKind as g, FindingSubjectStringSchema as h, type FindingsDiff as i, FindingsStore as j, KNOWLEDGE_GAP_KIND_SPEC as k, KNOWLEDGE_POISONING_KIND_SPEC as l, SKILL_USAGE_ANALYST as m, SkillUsageAnalyst as n, type SkillUsageRecord as o, type SkillUsageReport as p, type SkillUsageScanConfig as q, buildDefaultAnalystRegistry as r, buildSkillUsageReport as s, createAnalystAi as t, defaultIsMaterial as u, diffFindings as v, emitSkillUsageFindings as w, parseFindingSubject as x, renderFindingSubject as y, type ConceptComplexity as z };

package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} RENAMED Viewed

@@ -245,4 +245,4 @@ interface TraceAnalysisStore {
     }): Promise<SearchSpanResult>;
 }
-export { DEFAULT_TRACE_ANALYST_BUDGETS as D, type QueryTracesPage as Q, type SearchSpanResult as S, type TraceAnalysisStore as T, type ViewSpansResult as V, type DatasetOverview as a, type SearchTraceResult as b, type SpanMatchRecord as c, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX as d, type TraceAnalystByteBudgets as e, type TraceAnalystFilters as f, type TraceAnalystSpan as g, type TraceAnalystSpanKind as h, type TraceAnalystSpanStatus as i, type TraceAnalystTraceSummary as j, type ViewTraceOversized as k, type ViewTraceResult as l };
+export { DEFAULT_TRACE_ANALYST_BUDGETS as D, type QueryTracesPage as Q, type SearchSpanResult as S, type TraceAnalysisStore as T, type ViewSpansResult as V, type TraceAnalystSpan as a, type DatasetOverview as b, type SearchTraceResult as c, type SpanMatchRecord as d, TRACE_ANALYST_TRUNCATION_MARKER_PREFIX as e, type TraceAnalystByteBudgets as f, type TraceAnalystFilters as g, type TraceAnalystSpanKind as h, type TraceAnalystSpanStatus as i, type TraceAnalystTraceSummary as j, type ViewTraceOversized as k, type ViewTraceResult as l };