npm - @tangle-network/agent-eval - Versions diffs - 0.72.0 → 0.72.3 - Mend

@tangle-network/agent-eval 0.72.0 → 0.72.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69) hide show

package/CHANGELOG.md +39 -0
package/dist/adapters/http.d.ts +1 -1
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/otel.d.ts +3 -2
package/dist/agent-profile-DYRboYWu.d.ts +364 -0
package/dist/analyst/index.d.ts +221 -0
package/dist/analyst/index.js +371 -0
package/dist/analyst/index.js.map +1 -0
package/dist/analyst-t7zZS3TV.d.ts +88 -0
package/dist/campaign/index.d.ts +485 -9
package/dist/campaign/index.js +597 -22
package/dist/campaign/index.js.map +1 -1
package/dist/chunk-7W4SM7FD.js +1075 -0
package/dist/chunk-7W4SM7FD.js.map +1 -0
package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
package/dist/chunk-JHA3ZGSO.js +1496 -0
package/dist/chunk-JHA3ZGSO.js.map +1 -0
package/dist/{chunk-4QJN7RDX.js → chunk-JYE3WOTE.js} +55 -7
package/dist/{chunk-4QJN7RDX.js.map → chunk-JYE3WOTE.js.map} +1 -1
package/dist/chunk-LB2UOI5F.js +412 -0
package/dist/chunk-LB2UOI5F.js.map +1 -0
package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
package/dist/chunk-VUINJM5M.js.map +1 -0
package/dist/chunk-WYIHD6EB.js +1044 -0
package/dist/chunk-WYIHD6EB.js.map +1 -0
package/dist/{chunk-UD6EF73X.js → chunk-XPILG2CA.js} +119 -2
package/dist/chunk-XPILG2CA.js.map +1 -0
package/dist/contract/index.d.ts +17 -13
package/dist/contract/index.js +13 -7
package/dist/contract/index.js.map +1 -1
package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
package/dist/hosted/index.d.ts +223 -2
package/dist/index.d.ts +49 -1323
package/dist/index.js +353 -2496
package/dist/index.js.map +1 -1
package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
package/dist/openapi.json +1 -1
package/dist/pareto-E-pembql.d.ts +81 -0
package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
package/dist/redact-B40YG2M_.d.ts +45 -0
package/dist/registry-DuVYiTvw.d.ts +128 -0
package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
package/dist/rl.d.ts +4 -3
package/dist/rl.js +4 -4
package/dist/run-critic-BAIjX99r.d.ts +56 -0
package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
package/dist/traces.d.ts +371 -308
package/dist/traces.js +43 -18
package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
package/dist/wire/index.d.ts +1 -1
package/dist/workflow/index.d.ts +494 -0
package/dist/workflow/index.js +2177 -0
package/dist/workflow/index.js.map +1 -0
package/docs/design/self-improvement-roadmap.md +106 -0
package/package.json +36 -12
package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
package/dist/chunk-ODGETRTM.js.map +0 -1
package/dist/chunk-SL55X4VN.js +0 -186
package/dist/chunk-SL55X4VN.js.map +0 -1
package/dist/chunk-UD6EF73X.js.map +0 -1
package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0

package/dist/chunk-7W4SM7FD.js ADDED Viewed

@@ -0,0 +1,1075 @@
+import {
+  AnalystRegistry,
+  DEFAULT_TRACE_ANALYST_KINDS,
+  computeFindingId,
+  createTraceAnalystKind,
+  makeFinding
+} from "./chunk-WYIHD6EB.js";
+import {
+  LlmClient,
+  callLlmJson
+} from "./chunk-IHDHUN2X.js";
+import {
+  NotFoundError
+} from "./chunk-3BFEG2F6.js";
+// src/analyst/ax-service.ts
+import { ai } from "@ax-llm/ax";
+function createAnalystAi(config) {
+  return ai({
+    name: config.provider ?? "openai",
+    apiKey: config.apiKey,
+    apiURL: config.baseUrl,
+    config: { model: config.model }
+  });
+}
+// src/trace-analyst/behavioral-metrics.ts
+// Minimum last/first input-token ratio for the "monotonic-input-growth" signal.
+var INPUT_GROWTH_FACTOR = 3;
+// Minimum number of tool calls before the tool-usage signals are evaluated.
+var MIN_TOOL_CALLS = 3;
+// Tool names matching this pattern (case-insensitive) count as self-verification.
+var VERIFY_RE = /verif|eval|inspect|check|assert|validat|review|confirm/i;
+function num(v) {
+  return typeof v === "number" && Number.isFinite(v) ? v : null;
+}
+function inputTokensOf(s) {
+  return num(s.attributes["llm.input_tokens"]) ?? num(s.attributes["llm.usage.input_tokens"]);
+}
+function outputTokensOf(s) {
+  return num(s.attributes["llm.output_tokens"]) ?? num(s.attributes["llm.usage.output_tokens"]);
+}
+function stepOf(s) {
+  return num(s.attributes.step);
+}
+function toolNameOf(s) {
+  if (s.tool_name) return s.tool_name;
+  const t = s.attributes["tool.name"];
+  return typeof t === "string" && t.length > 0 ? t : null;
+}
+/**
+ * Derive token/tool usage metrics and heuristic efficiency signals from spans.
+ *
+ * @param spans - Spans exposing `attributes` and `start_time`; `tool_name` is
+ *   read when present. The array is copied before sorting, so it is not mutated.
+ * @returns Token trajectories, the per-tool call histogram, summary counts and
+ *   the triggered `signals` (each with `code`, `severity`, `detail`, `evidence`).
+ */
+function computeTraceMetrics(spans) {
+  // Order by the numeric `step` attribute when both spans have distinct steps;
+  // otherwise fall back to comparing `start_time` with localeCompare.
+  const ordered = [...spans].sort((a, b) => {
+    const sa = stepOf(a);
+    const sb = stepOf(b);
+    if (sa !== null && sb !== null && sa !== sb) return sa - sb;
+    return a.start_time.localeCompare(b.start_time);
+  });
+  const inputTokenTrajectory = [];
+  const outputTokenTrajectory = [];
+  const toolHistogram = {};
+  let hasSelfVerification = false;
+  // Single pass: collect token samples and count tool calls per tool name.
+  for (const s of ordered) {
+    const inT = inputTokensOf(s);
+    if (inT !== null) inputTokenTrajectory.push(inT);
+    const outT = outputTokensOf(s);
+    if (outT !== null) outputTokenTrajectory.push(outT);
+    const tool = toolNameOf(s);
+    if (tool) {
+      toolHistogram[tool] = (toolHistogram[tool] ?? 0) + 1;
+      if (VERIFY_RE.test(tool)) hasSelfVerification = true;
+    }
+  }
+  const totalToolCalls = Object.values(toolHistogram).reduce((a, b) => a + b, 0);
+  const distinctTools = Object.keys(toolHistogram).length;
+  // Defined as 1 when there are no tool calls, which avoids a 0/0 division.
+  const toolDiversityRatio = totalToolCalls === 0 ? 1 : distinctTools / totalToolCalls;
+  const signals = [];
+  // Input growth: needs at least 3 samples. Only the first and last samples are
+  // compared, so despite the signal code the series is not checked for being monotonic.
+  if (inputTokenTrajectory.length >= 3) {
+    const first = inputTokenTrajectory[0];
+    const last = inputTokenTrajectory[inputTokenTrajectory.length - 1];
+    // A first sample of 0 (or below) yields growth 0, so the signal cannot fire.
+    const growth = first > 0 ? last / first : 0;
+    if (last > first && growth >= INPUT_GROWTH_FACTOR) {
+      signals.push({
+        code: "monotonic-input-growth",
+        severity: "high",
+        detail: `LLM input tokens grew ${growth.toFixed(1)}x (${first}\u2192${last}) across ${inputTokenTrajectory.length} calls \u2014 full history re-sent each step with no compression.`,
+        evidence: {
+          first,
+          last,
+          growth_x: Number(growth.toFixed(2)),
+          calls: inputTokenTrajectory.length
+        }
+      });
+    }
+  }
+  // Output decay: needs at least 3 samples and fires on any decrease between
+  // the first and last sample, however small.
+  if (outputTokenTrajectory.length >= 3) {
+    const first = outputTokenTrajectory[0];
+    const last = outputTokenTrajectory[outputTokenTrajectory.length - 1];
+    if (last < first) {
+      signals.push({
+        code: "output-length-decay",
+        severity: "medium",
+        detail: `LLM output tokens shrank ${first}\u2192${last} over ${outputTokenTrajectory.length} calls \u2014 less planning/reasoning per step as context grows.`,
+        evidence: { first, last, calls: outputTokenTrajectory.length }
+      });
+    }
+  }
+  // Tool monoculture: enough calls, all of them to one tool.
+  if (totalToolCalls >= MIN_TOOL_CALLS && distinctTools === 1) {
+    const only = Object.keys(toolHistogram)[0];
+    signals.push({
+      code: "single-tool-dependency",
+      severity: "medium",
+      detail: `All ${totalToolCalls} tool calls are \`${only}\` \u2014 no tool diversity and no fallback path.`,
+      evidence: { tool: only, calls: totalToolCalls, distinct_tools: 1 }
+    });
+  }
+  // Missing self-verification: enough calls, none with a tool name matching VERIFY_RE.
+  if (totalToolCalls >= MIN_TOOL_CALLS && !hasSelfVerification) {
+    signals.push({
+      code: "no-self-verification",
+      severity: "medium",
+      detail: `${totalToolCalls} tool calls and none verify/inspect/check state \u2014 the agent never validates its own actions.`,
+      evidence: { tool_calls: totalToolCalls, verification_calls: 0 }
+    });
+  }
+  return {
+    // Counts only spans that reported an input-token value.
+    llmCallCount: inputTokenTrajectory.length,
+    inputTokenTrajectory,
+    outputTokenTrajectory,
+    toolHistogram,
+    totalToolCalls,
+    distinctTools,
+    toolDiversityRatio,
+    hasSelfVerification,
+    signals
+  };
+}
+// src/analyst/behavioral-analyst.ts
+// Remediation text per signal code; looked up by `sig.code` in deriveEfficiencyFindings.
+var RECOMMENDED_ACTION = {
+  "monotonic-input-growth": "Add a context-budget instruction: once prior context exceeds a threshold, summarize earlier steps into a short status line instead of re-sending full history.",
+  "output-length-decay": "Require a minimum planning/reasoning budget per step so late steps do not degrade into terse, error-prone commands.",
+  "single-tool-dependency": "Direct the agent to use the full toolset (verify / inspect / alternate actions), not a single execute call, and to plan a fallback when a call returns an unexpected result.",
+  "no-self-verification": "After every state-mutating action, verify the result (eval / inspect / assert) before proceeding."
+};
+// Default `analyst_id` for efficiency findings and the id reported by behavioralAnalyst().
+var ANALYST_ID = "efficiency-behavioral";
+function deriveEfficiencyFindings(metrics, opts = {}) {
+  const analystId = opts.analystId ?? ANALYST_ID;
+  return metrics.signals.map(
+    (sig) => makeFinding({
+      analyst_id: analystId,
+      area: "efficiency",
+      subject: sig.code,
+      // kebab — passes the cluster grammar; stable key for diffFindings
+      claim: sig.detail,
+      severity: sig.severity,
+      // Deterministic arithmetic over spans, not a model judgment → certain.
+      confidence: 1,
+      evidence_refs: [
+        {
+          kind: "metric",
+          uri: `metric://efficiency/${sig.code}`,
+          excerpt: JSON.stringify(sig.evidence)
+        }
+      ],
+      recommended_action: RECOMMENDED_ACTION[sig.code],
+      metadata: { deterministic: true, evidence: sig.evidence },
+      ...opts.producedAt ? { produced_at: opts.producedAt } : {}
+    })
+  );
+}
+function behavioralAnalyst() {
+  return {
+    id: ANALYST_ID,
+    description: "Deterministic behavioral/efficiency findings over OTLP spans \u2014 token-growth, output-decay, tool-monoculture, missing self-verification. Zero LLM; model-agnostic by construction.",
+    inputKind: "trace-store",
+    cost: { kind: "deterministic" },
+    version: "1.0.0",
+    async analyze(store) {
+      const overview = await store.getOverview();
+      const spans = [];
+      for (const traceId of overview.sample_trace_ids) {
+        const viewed = await store.viewTrace({ trace_id: traceId });
+        if (viewed.spans) spans.push(...viewed.spans);
+      }
+      return deriveEfficiencyFindings(computeTraceMetrics(spans));
+    }
+  };
+}
+// src/analyst/default-registry.ts
+function buildDefaultAnalystRegistry(opts = {}) {
+  const registry = new AnalystRegistry(opts.registry);
+  if (opts.includeBehavioral !== false) {
+    registry.register(behavioralAnalyst());
+  }
+  if (opts.ai) {
+    const kinds = opts.kinds ?? DEFAULT_TRACE_ANALYST_KINDS;
+    for (const spec of kinds) {
+      registry.register(createTraceAnalystKind(spec, { ai: opts.ai, model: opts.model }));
+    }
+  }
+  return registry;
+}
+// src/concurrency.ts
+var Mutex = class {
+  locked = false;
+  waiters = [];
+  async acquire() {
+    if (!this.locked) {
+      this.locked = true;
+      return () => this.release();
+    }
+    return new Promise((resolve) => {
+      this.waiters.push(() => {
+        resolve(() => this.release());
+      });
+    });
+  }
+  release() {
+    const next = this.waiters.shift();
+    if (next) {
+      next();
+    } else {
+      this.locked = false;
+    }
+  }
+  async runExclusive(fn) {
+    const release = await this.acquire();
+    try {
+      return await fn();
+    } finally {
+      release();
+    }
+  }
+  /** True iff someone holds the lock right now. Diagnostics only. */
+  get isLocked() {
+    return this.locked;
+  }
+  /** Pending waiter count. Diagnostics only. */
+  get pending() {
+    return this.waiters.length;
+  }
+};
+// src/locked-jsonl-appender.ts
+import { appendFileSync, existsSync, mkdirSync } from "fs";
+import { dirname } from "path";
+var mutexes = /* @__PURE__ */ new Map();
+function getMutex(path) {
+  let m = mutexes.get(path);
+  if (!m) {
+    m = new Mutex();
+    mutexes.set(path, m);
+  }
+  return m;
+}
+var LockedJsonlAppender = class {
+  constructor(path) {
+    this.path = path;
+    this.mutex = getMutex(path);
+    if (!existsSync(dirname(path))) {
+      mkdirSync(dirname(path), { recursive: true });
+    }
+  }
+  path;
+  mutex;
+  async append(entry) {
+    const line = `${JSON.stringify(entry)}
+`;
+    await this.mutex.runExclusive(() => {
+      appendFileSync(this.path, line);
+    });
+  }
+};
+/**
+ * Test helper: empties the module-level `mutexes` map, so the next `getMutex`
+ * call for any path creates a fresh Mutex. Appenders constructed earlier keep
+ * the Mutex they already hold.
+ */
+function resetLockedAppendersForTesting() {
+  mutexes.clear();
+}
+// src/analyst/findings-store.ts
+import { existsSync as existsSync2, readFileSync } from "fs";
+var FindingsStore = class {
+  constructor(path) {
+    this.path = path;
+    this.appender = new LockedJsonlAppender(path);
+  }
+  path;
+  appender;
+  async append(runId, findings) {
+    for (const f of findings) {
+      const row = { ...f, run_id: runId };
+      await this.appender.append(row);
+    }
+  }
+  /** Load every persisted finding. Discards malformed trailing lines silently. */
+  loadAll() {
+    if (!existsSync2(this.path)) return [];
+    const raw = readFileSync(this.path, "utf8");
+    if (!raw) return [];
+    const out = [];
+    for (const line of raw.split("\n")) {
+      if (!line) continue;
+      try {
+        out.push(JSON.parse(line));
+      } catch {
+      }
+    }
+    return out;
+  }
+  /** Filter to a single run. */
+  loadRun(runId) {
+    return this.loadAll().filter((r) => r.run_id === runId);
+  }
+};
+function defaultIsMaterial(a, b) {
+  if (a.severity !== b.severity) return true;
+  if (Math.abs((a.confidence ?? 0) - (b.confidence ?? 0)) > 0.05) return true;
+  if (a.evidence_refs.length !== b.evidence_refs.length) return true;
+  return false;
+}
+function diffFindings(previous, current, policy = {}) {
+  const isMaterial = policy.isMaterial ?? defaultIsMaterial;
+  const prevById = new Map(previous.map((f) => [f.finding_id, f]));
+  const curById = new Map(current.map((f) => [f.finding_id, f]));
+  const appeared = [];
+  const disappeared = [];
+  const persisted = [];
+  const changed = [];
+  for (const [id, cur] of curById) {
+    const prev = prevById.get(id);
+    if (!prev) {
+      appeared.push(cur);
+      continue;
+    }
+    if (isMaterial(prev, cur)) {
+      changed.push({ previous: prev, current: cur });
+    } else {
+      persisted.push(cur);
+    }
+  }
+  for (const [id, prev] of prevById) {
+    if (!curById.has(id)) disappeared.push(prev);
+  }
+  return { appeared, disappeared, persisted, changed };
+}
+// src/analyst/kinds/skill-usage.ts
+import { existsSync as existsSync3, readdirSync, readFileSync as readFileSync2, statSync } from "fs";
+import { join } from "path";
+// A SKILL.md with more lines than this is flagged as bloat when it has no references/ directory.
+var BLOAT_LINE_THRESHOLD = 300;
+// Terms counted as Tangle-private references. Global + case-insensitive: used
+// with String.prototype.match to count every occurrence in a skill body.
+var TANGLE_PRIVATE_RE = /\b(cli-bridge|tangletools|ops-board|drew-gtr-pro|@tangle-network\/|~\/company|tangle\.tools|gtm-agent)\b|\bkimi\b|\btcloud\b/gi;
+// Matches "trigger" or "triggers" followed by optional whitespace and ":" or "-", case-insensitive.
+var TRIGGER_RE = /triggers?\s*[:-]/i;
+function listSkillDirs(root) {
+  if (!existsSync3(root)) return [];
+  const out = [];
+  for (const entry of readdirSync(root, { withFileTypes: true })) {
+    if (!entry.isDirectory() && !entry.isSymbolicLink()) continue;
+    const skillMd = join(root, entry.name, "SKILL.md");
+    if (existsSync3(skillMd)) out.push({ name: entry.name, path: skillMd });
+  }
+  return out;
+}
+function walkJsonl(dir, cap) {
+  if (!existsSync3(dir)) return [];
+  const files = [];
+  const stack = [dir];
+  while (stack.length) {
+    const cur = stack.pop();
+    let entries;
+    try {
+      entries = readdirSync(cur, { withFileTypes: true });
+    } catch {
+      continue;
+    }
+    for (const e of entries) {
+      const full = join(cur, e.name);
+      if (e.isDirectory()) stack.push(full);
+      else if (e.name.endsWith(".jsonl")) {
+        files.push(full);
+        if (cap > 0 && files.length >= cap) return files;
+      }
+    }
+  }
+  return files;
+}
+function frontmatterDescription(body) {
+  const fm = /^---\n([\s\S]*?)\n---/.exec(body);
+  const block = fm?.[1] ?? "";
+  const m = /description:\s*(.+)/i.exec(block);
+  return m?.[1] ?? "";
+}
+function countArtifacts(roots, name, aliases) {
+  let n = 0;
+  for (const root of roots) {
+    const candidates = [join(root, ".evolve", name), ...aliases.map((a) => join(root, a))];
+    for (const dir of candidates) {
+      if (!existsSync3(dir)) continue;
+      try {
+        if (statSync(dir).isDirectory()) n += readdirSync(dir).length;
+        else n += 1;
+      } catch {
+      }
+    }
+  }
+  return n;
+}
+function buildSkillUsageReport(config) {
+  const skills = config.skillRoots.flatMap(
+    ({ root, kind }) => listSkillDirs(root).map((s) => ({ ...s, kind }))
+  );
+  const names = skills.map((s) => s.name);
+  const direct = new Map(names.map((n) => [n, 0]));
+  const slash = new Map(names.map((n) => [n, 0]));
+  const skillRe = /"skill"\s*:\s*"([a-z0-9_:-]+)"/g;
+  const cmdRe = /<command-name>\/?([a-z0-9_:-]+)<\/command-name>/g;
+  let transcripts = 0;
+  for (const dir of config.transcriptDirs) {
+    for (const file of walkJsonl(dir, config.maxTranscriptsPerDir ?? 0)) {
+      transcripts += 1;
+      let data;
+      try {
+        data = readFileSync2(file, "utf8");
+      } catch {
+        continue;
+      }
+      for (const m of data.matchAll(skillRe)) {
+        const g = m[1];
+        if (!g) continue;
+        const n = g.split(":").pop() ?? g;
+        const prev = direct.get(n);
+        if (prev !== void 0) direct.set(n, prev + 1);
+      }
+      for (const m of data.matchAll(cmdRe)) {
+        const g = m[1];
+        if (g === void 0) continue;
+        const prev = slash.get(g);
+        if (prev !== void 0) slash.set(g, prev + 1);
+      }
+    }
+  }
+  const bodies = /* @__PURE__ */ new Map();
+  for (const s of skills) {
+    try {
+      bodies.set(s.name, readFileSync2(s.path, "utf8"));
+    } catch {
+      bodies.set(s.name, "");
+    }
+  }
+  const inbound = new Map(names.map((n) => [n, 0]));
+  for (const target of names) {
+    const ref = new RegExp(`/${target}\\b|\\[\\[${target}\\]\\]`);
+    for (const s of skills) {
+      if (s.name === target) continue;
+      if (ref.test(bodies.get(s.name) ?? "")) inbound.set(target, inbound.get(target) + 1);
+    }
+  }
+  const records = skills.map((s) => {
+    const body = bodies.get(s.name) ?? "";
+    const dir = s.path.replace(/\/SKILL\.md$/, "");
+    return {
+      name: s.name,
+      kind: s.kind,
+      path: s.path,
+      lines: body ? body.split("\n").length : 0,
+      directInvocations: direct.get(s.name) ?? 0,
+      slashInvocations: slash.get(s.name) ?? 0,
+      inboundRefs: inbound.get(s.name) ?? 0,
+      artifactCount: countArtifacts(
+        config.artifactRoots ?? [],
+        s.name,
+        config.artifactAliases?.[s.name] ?? []
+      ),
+      tanglePrivateRefs: (body.match(TANGLE_PRIVATE_RE) ?? []).length,
+      hasReferencesDir: existsSync3(join(dir, "references")),
+      hasEvalsDir: existsSync3(join(dir, "evals")),
+      logsRuns: body.includes("skill-runs.jsonl"),
+      hasTriggerPhrases: TRIGGER_RE.test(frontmatterDescription(body) || body.slice(0, 600))
+    };
+  });
+  return { generatedFromTraces: transcripts, records };
+}
+var ANALYST_ID2 = "skill-usage";
+function finding(area, subject, claim, severity, confidence, producedAt, recommended, evidenceUri, rationale) {
+  return {
+    schema_version: "1.0.0",
+    finding_id: computeFindingId({ analyst_id: ANALYST_ID2, area, subject, claim }),
+    analyst_id: ANALYST_ID2,
+    produced_at: producedAt,
+    severity,
+    area,
+    claim,
+    rationale,
+    evidence_refs: [{ kind: "artifact", uri: evidenceUri }],
+    recommended_action: recommended,
+    confidence,
+    subject
+  };
+}
+/**
+ * Apply the skill-usage rule set to every record of a report.
+ *
+ * @param report - Object with a `records` array (as built by buildSkillUsageReport).
+ * @param producedAt - Timestamp stamped on every finding.
+ * @returns All findings, grouped by record in record order; one record can
+ *   produce several findings because the rules are evaluated independently.
+ */
+function emitSkillUsageFindings(report, producedAt) {
+  const out = [];
+  for (const r of report.records) {
+    const directTotal = r.directInvocations + r.slashInvocations;
+    // "True" usage also counts references from sibling skills and on-disk artifacts.
+    const trueUsage = directTotal + r.inboundRefs + r.artifactCount;
+    // Rule 1: no usage on any signal, or usage visible only indirectly.
+    if (trueUsage === 0) {
+      out.push(
+        finding(
+          "skill-usage",
+          r.name,
+          `Skill '${r.name}' has zero usage across all signals (direct, slash, inbound-refs, artifacts)`,
+          "high",
+          0.6,
+          producedAt,
+          "Confirm the skill covers a real recurring job; if not, deprecate. Zero true usage is the only deterministic deprecation candidate.",
+          r.path,
+          "No Skill-tool call, no slash invocation, no sibling dispatches to it, and no on-disk artifacts."
+        )
+      );
+    } else if (directTotal === 0 && r.inboundRefs + r.artifactCount > 0) {
+      out.push(
+        finding(
+          "skill-usage",
+          r.name,
+          `Skill '${r.name}' shows 0 direct invocations but is used via orchestration/artifacts (inbound=${r.inboundRefs}, artifacts=${r.artifactCount})`,
+          "info",
+          0.8,
+          producedAt,
+          "Do NOT treat as unused \u2014 usage is real but logged under parent skills or on disk. Strengthen direct-invocation discovery only if direct use is desired.",
+          r.path,
+          "The Skill-tool counter undercounts orchestrated/chained leaf skills."
+        )
+      );
+    }
+    // Rule 2: at most two direct invocations and no trigger phrases detected.
+    if (directTotal <= 2 && !r.hasTriggerPhrases) {
+      out.push(
+        finding(
+          "discoverability",
+          r.name,
+          `Skill '${r.name}' is rarely invoked directly and its description has no explicit trigger phrases`,
+          "medium",
+          0.7,
+          producedAt,
+          "Add a `Triggers:` clause with verbatim user phrases to the frontmatter description so the model auto-invokes it.",
+          r.path
+        )
+      );
+    }
+    // Rule 3: a skill of kind "public" whose body matched Tangle-private terms.
+    if (r.kind === "public" && r.tanglePrivateRefs > 0) {
+      out.push(
+        finding(
+          "safety",
+          r.name,
+          `Public skill '${r.name}' carries ${r.tanglePrivateRefs} Tangle-private reference(s)`,
+          "high",
+          0.75,
+          producedAt,
+          "Sanitize incidental internal refs (cli-bridge/kimi/tcloud/~company/private repos) or relocate to a private repo. Verify @tangle-network/* refs are to PUBLISHED packages before treating as a leak.",
+          r.path
+        )
+      );
+    }
+    // Rule 4: SKILL.md longer than BLOAT_LINE_THRESHOLD lines with no references/ directory.
+    if (r.lines > BLOAT_LINE_THRESHOLD && !r.hasReferencesDir) {
+      out.push(
+        finding(
+          "maintainability",
+          r.name,
+          `Skill '${r.name}' is ${r.lines} lines with no references/ split (progressive disclosure)`,
+          "medium",
+          0.8,
+          producedAt,
+          `Split detail into references/ loaded on demand; keep SKILL.md a short overview. ${r.lines} lines load into every session's context budget.`,
+          r.path
+        )
+      );
+    }
+    // Rule 5: no evals/ directory next to SKILL.md.
+    if (!r.hasEvalsDir) {
+      out.push(
+        finding(
+          "data-quality",
+          r.name,
+          `Skill '${r.name}' ships no evals/`,
+          "low",
+          0.6,
+          producedAt,
+          "Add evals/evals.json with >=3 scenarios proving the skill beats baseline; gives regression coverage.",
+          r.path
+        )
+      );
+    }
+    // Rule 6: the skill body never mentions "skill-runs.jsonl".
+    if (!r.logsRuns) {
+      out.push(
+        finding(
+          "observability",
+          r.name,
+          `Skill '${r.name}' never appends to .evolve/skill-runs.jsonl`,
+          "low",
+          0.55,
+          producedAt,
+          "Append one run line to .evolve/skill-runs.jsonl on completion, or declare it a non-logging leaf, so the self-improvement loop can see it ran.",
+          r.path
+        )
+      );
+    }
+  }
+  return out;
+}
+var SkillUsageAnalyst = class {
+  id = ANALYST_ID2;
+  description = "Deterministic multi-signal skill-usage analysis: flags dead skills, measurement-invisible (orchestrated) usage, discovery gaps, public-repo leaks, bloat, missing evals, and missing run-logging.";
+  inputKind = "custom";
+  cost = { kind: "deterministic", est_usd_per_run: 0 };
+  version = "1.0.0";
+  async analyze(input, ctx) {
+    const producedAt = ctx.tags?.producedAt ?? (/* @__PURE__ */ new Date()).toISOString();
+    ctx.log?.(
+      `skill-usage: ${input.records.length} skills over ${input.generatedFromTraces} transcripts`
+    );
+    return emitSkillUsageFindings(input, producedAt);
+  }
+};
+var SKILL_USAGE_ANALYST = new SkillUsageAnalyst();
+// src/run-score.ts
+// Default weights for aggregateRunScore. Negative weights act as penalties.
+// `costUsd` is applied per USD; `wallSeconds` is applied per minute, because
+// aggregateRunScore divides the seconds value by 60 before weighting.
+var DEFAULT_RUN_SCORE_WEIGHTS = {
+  success: 4,
+  goalProgress: 2,
+  repoGroundedness: 1.5,
+  driftPenalty: -1.5,
+  toolUseQuality: 1,
+  patchQuality: 1.25,
+  testReality: 1.5,
+  finalGate: 3,
+  reviewerBlockers: -2,
+  costUsd: -0.2,
+  wallSeconds: -0.1
+};
+function aggregateRunScore(score, weights = {}) {
+  const w = { ...DEFAULT_RUN_SCORE_WEIGHTS, ...weights };
+  return w.success * clamp01(score.success) + w.goalProgress * clamp01(score.goalProgress) + w.repoGroundedness * clamp01(score.repoGroundedness) + w.driftPenalty * clamp01(score.driftPenalty) + w.toolUseQuality * clamp01(score.toolUseQuality) + w.patchQuality * clamp01(score.patchQuality) + w.testReality * clamp01(score.testReality) + w.finalGate * clamp01(score.finalGate) + w.reviewerBlockers * clamp01(score.reviewerBlockers) + w.costUsd * Math.max(0, finiteOrZero(score.costUsd)) + w.wallSeconds * Math.max(0, finiteOrZero(score.wallSeconds) / 60);
+}
+function clamp01(value) {
+  if (!Number.isFinite(value)) return 0;
+  return Math.max(0, Math.min(1, value));
+}
+function finiteOrZero(value) {
+  return Number.isFinite(value) ? value : 0;
+}
+// src/run-critic.ts
+// Patterns RunCritic uses as drift indicators when no `driftPatterns` option is
+// supplied; they are tested against LLM span outputs and serialized event payloads.
+var DEFAULT_DRIFT_PATTERNS = [
+  /https?:\/\//i,
+  /\btitle:\s/i,
+  /\bsummary:\s/i,
+  /\burl:\s/i,
+  /\bnpm package usage\b/i,
+  /\bnews\b/i
+];
+var RunCritic = class {
+  weights;
+  driftPatterns;
+  constructor(options = {}) {
+    this.weights = options.weights;
+    this.driftPatterns = options.driftPatterns ?? DEFAULT_DRIFT_PATTERNS;
+  }
+  async score(store, runId) {
+    const run = await store.getRun(runId);
+    if (!run) throw new NotFoundError(`run ${runId} not found`);
+    const [spans, events, artifacts, budget] = await Promise.all([
+      store.spans({ runId }),
+      store.events({ runId }),
+      store.artifacts(runId),
+      store.budget(runId)
+    ]);
+    return this.scoreTrace({ run, spans, events, artifacts, budget });
+  }
+  scoreTrace(trace) {
+    const notes = [];
+    const llmSpans = trace.spans.filter(
+      (s) => s.kind === "llm"
+    );
+    const toolSpans = trace.spans.filter(
+      (s) => s.kind === "tool"
+    );
+    const judgeSpans = trace.spans.filter(
+      (s) => s.kind === "judge"
+    );
+    const sandboxSpans = trace.spans.filter(
+      (s) => s.kind === "sandbox"
+    );
+    const finalGateSpans = judgeSpans.filter(
+      (span) => span.dimension === "final_gate" || span.attributes?.finalGate === true
+    );
+    const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
+    if (!success) notes.push("run did not complete with pass=true");
+    const judgeAverage = judgeSpans.length ? judgeSpans.reduce((sum, span) => sum + normalizeJudgeScore(span.score), 0) / judgeSpans.length : void 0;
+    const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(
+      trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score
+    ) : void 0;
+    const goalProgress = outcomeScore ?? judgeAverage ?? success;
+    const successfulTools = toolSpans.filter((span) => span.status !== "error").length;
+    const toolUseQuality = toolSpans.length === 0 ? 0 : successfulTools / toolSpans.length;
+    if (toolSpans.length === 0) notes.push("no tool spans recorded");
+    const patchEvidence = trace.artifacts.length + toolSpans.filter((span) => /write|edit|patch|apply/i.test(span.toolName)).length;
+    const patchQuality = patchEvidence > 0 ? clamp01(patchEvidence / 4) : 0;
+    if (!patchQuality) notes.push("no artifact or edit evidence recorded");
+    const sandboxTests = sandboxSpans.filter(
+      (span) => typeof span.testsTotal === "number" && span.testsTotal > 0
+    );
+    const testReality = sandboxTests.length ? sandboxTests.reduce(
+      (sum, span) => sum + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1),
+      0
+    ) / sandboxTests.length : toolSpans.some(
+      (span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))
+    ) ? 0.4 : 0;
+    if (!testReality) notes.push("no real test/build evidence recorded");
+    const blockerSpans = judgeSpans.filter((span) => isBlockingJudge(span));
+    const finalGateBlockers = finalGateSpans.filter((span) => isBlockingJudge(span));
+    const finalGate = finalGateSpans.length ? finalGateBlockers.length ? 0 : 1 : success;
+    if (finalGateBlockers.length)
+      notes.push(`final gate blocked by ${finalGateBlockers.length} reviewer(s)`);
+    else if (!finalGateSpans.length) notes.push("no final gate judgment recorded");
+    const reviewerBlockers = judgeSpans.length ? blockerSpans.length / judgeSpans.length : 0;
+    if (reviewerBlockers) notes.push(`detected ${blockerSpans.length} blocking reviewer signal(s)`);
+    const positiveGroundingSignals = patchEvidence + sandboxSpans.length + llmSpans.filter((span) => looksRepoGrounded(span.output ?? "")).length;
+    const driftSignals = llmSpans.filter((span) => this.isDrift(span.output ?? "")).length + trace.events.filter((event) => this.isDrift(JSON.stringify(event.payload))).length;
+    const repoGroundedness = positiveGroundingSignals + driftSignals === 0 ? 0 : positiveGroundingSignals / (positiveGroundingSignals + driftSignals);
+    const driftPenalty = positiveGroundingSignals + driftSignals === 0 ? 0 : driftSignals / (positiveGroundingSignals + driftSignals);
+    if (driftSignals > 0) notes.push(`detected ${driftSignals} drift signal(s)`);
+    const costUsd = trace.budget.length ? Math.max(
+      ...trace.budget.filter((entry) => entry.dimension === "usd").map((entry) => entry.consumed),
+      0
+    ) : llmSpans.reduce((sum, span) => sum + (span.costUsd ?? 0), 0);
+    const wallSeconds = trace.run.endedAt && trace.run.startedAt ? Math.max(0, (trace.run.endedAt - trace.run.startedAt) / 1e3) : 0;
+    return {
+      success,
+      goalProgress,
+      repoGroundedness,
+      driftPenalty,
+      toolUseQuality,
+      patchQuality,
+      testReality,
+      finalGate,
+      reviewerBlockers,
+      costUsd,
+      wallSeconds,
+      notes
+    };
+  }
+  rank(score) {
+    return aggregateRunScore(score, this.weights);
+  }
+  isDrift(text) {
+    return this.driftPatterns.some((pattern) => pattern.test(text));
+  }
+};
+function normalizeJudgeScore(score) {
+  return score > 1 ? clamp01(score / 10) : clamp01(score);
+}
+function looksRepoGrounded(text) {
+  return /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i.test(
+    text
+  );
+}
+function isBlockingJudge(span) {
+  return span.attributes?.blocking === true || span.attributes?.verdict === "BLOCKING" || positiveNumber(span.attributes?.blockingFindings) || positiveNumber(span.attributes?.highFindings) || span.score <= 2;
+}
+function positiveNumber(value) {
+  return typeof value === "number" && value > 0;
+}
+// src/semantic-concept-judge.ts
+// Default weights keyed by complexity class (render < integrate < compute).
+// NOTE(review): the consumer is outside this excerpt — confirm how they are applied.
+var DEFAULT_COMPLEXITY_WEIGHTS = {
+  render: 1,
+  integrate: 2,
+  compute: 2.5
+};
+// Version tag for the semantic concept judge.
+var SEMANTIC_CONCEPT_JUDGE_VERSION = "semantic-concept-judge-v1-2026-04-24";
+// NOTE(review): presumably character caps for source, served HTML and a single
+// file in the judge prompt — confirm at the use sites.
+var DEFAULT_MAX_SOURCE = 45e3;
+var DEFAULT_MAX_HTML = 3e4;
+var DEFAULT_MAX_PER_FILE = 2e4;
+// NOTE(review): presumably a timeout in milliseconds (180 000) — confirm at the use site.
+var DEFAULT_TIMEOUT = 18e4;
+// Default model identifier.
+var DEFAULT_MODEL = "claude-sonnet-4-6";
+// JSON-Schema-shaped description of the judge's expected output: a `summary`
+// string (20-600 chars) and a non-empty `concepts` array. Every concept item
+// needs `concept`, `present`, `score` (0-10), `evidence` (5-400 chars) and
+// `severity`; extra properties are disallowed at both levels.
+var SEMANTIC_SCHEMA = {
+  type: "object",
+  additionalProperties: false,
+  required: ["summary", "concepts"],
+  properties: {
+    summary: { type: "string", minLength: 20, maxLength: 600 },
+    concepts: {
+      type: "array",
+      minItems: 1,
+      items: {
+        type: "object",
+        additionalProperties: false,
+        required: ["concept", "present", "score", "evidence", "severity"],
+        properties: {
+          concept: { type: "string", minLength: 1, maxLength: 120 },
+          present: { type: "boolean" },
+          score: { type: "number", minimum: 0, maximum: 10 },
+          evidence: { type: "string", minLength: 5, maxLength: 400 },
+          severity: { type: "string", enum: ["critical", "major", "minor", "info"] }
+        }
+      }
+    }
+  }
+};
+/**
+ * Cap `body` at `cap` characters, appending a marker that says how much was cut.
+ *
+ * @param body - text to cap.
+ * @param cap - maximum number of characters (UTF-16 code units) to keep.
+ * @param label - name of the content, used only in the truncation marker.
+ * @returns `body` unchanged when it fits; otherwise its first `cap` characters
+ *   followed by a newline and an ellipsis marker with the dropped count.
+ */
+function truncate(body, cap, label) {
+  const fits = body.length <= cap;
+  if (fits) {
+    return body;
+  }
+  const kept = body.slice(0, cap);
+  const dropped = body.length - cap;
+  return `${kept}\n\u2026 [truncated ${dropped} chars of ${label}]`;
+}
+/**
+ * Assemble the user-message prompt for the semantic concept judge.
+ *
+ * The prompt contains, in order: the grading instructions, the user request,
+ * optional artifact metadata (only when `input.artifactLabel` is truthy), the
+ * numbered list of expected concepts (each with up to six keyword hints), the
+ * optional served HTML (only when `input.servedHtml` is non-empty), the
+ * concatenated source files, and the reply-format instructions.
+ *
+ * @param input - judge input: `userRequest`, `sourceFiles` (`path`/`content`),
+ *   `expectedConcepts` (`name`, optional `keywords`), and optional
+ *   `artifactLabel`, `artifactDescription`, `servedHtml`.
+ * @param opts - size caps: `maxPerFileChars`, `maxSourceChars`, `maxHtmlChars`.
+ * @returns the full prompt string.
+ */
+function buildPrompt(input, opts) {
+  // Files longer than the per-file cap are left out entirely, not truncated.
+  const includedFiles = input.sourceFiles.filter((file) => file.content.length <= opts.maxPerFileChars);
+  const sourceBlob = includedFiles.map((file) => `--- FILE: ${file.path} ---\n${file.content}`).join("\n\n");
+  const html = input.servedHtml ?? "";
+  // Optional sections end with their own newline so the heading that follows
+  // starts on a fresh line; when absent they contribute nothing.
+  const metadataSection = input.artifactLabel ? `ARTIFACT METADATA:\n  name: ${input.artifactLabel}\n  description: ${input.artifactDescription ?? ""}\n` : "";
+  const conceptLines = input.expectedConcepts.map((spec, index) => {
+    const hints = spec.keywords?.length ? ` \u2014 hints: [${spec.keywords.slice(0, 6).join(" | ")}]` : "";
+    return `  ${index + 1}. "${spec.name}"${hints}`;
+  }).join("\n");
+  const htmlSection = html ? `SERVED HTML (what the preview returns when hit):\n${truncate(html, opts.maxHtmlChars, "HTML")}\n` : "";
+  const sourceSection = truncate(sourceBlob, opts.maxSourceChars, "source");
+  const promptLines = [
+    `You are a strict code-review judge evaluating whether an agent's 0-to-1 build actually implements the features the user asked for.`,
+    `You MUST distinguish:`,
+    `  (a) WORKING code that implements the concept (rendered UI, wired handler, real API call),`,
+    `  (b) KEYWORD-PRESENT stub (comments mentioning the concept, variable names, TODOs),`,
+    `  (c) ABSENT (concept nowhere).`,
+    `A comment like "// TODO: add mint button" is NOT present \u2014 score 2-3. Only count a concept as present if there is real functional code: a rendered component, a call handler wired to state or a network call, a computed value actually used.`,
+    `USER REQUEST (what the agent was asked to build):`,
+    `${input.userRequest}`,
+    `${metadataSection}EXPECTED CONCEPTS (each must be graded independently):`,
+    conceptLines,
+    `${htmlSection}SOURCE FILES (the agent's workdir):`,
+    sourceSection,
+    `For EACH concept, return:`,
+    `  - concept: the concept name as given (match exactly)`,
+    `  - present: boolean \u2014 does a working implementation exist?`,
+    `  - score: 0-10 \u2014 10 = production-ready; 7 = functional but thin; 4 = partial/stubbed; 2 = keyword-only comment; 0 = absent`,
+    `  - evidence: cite "<file>:<line>" or "served-html:<selector>" pointing at the strongest supporting code. If the concept is absent or stubbed, explain what's missing.`,
+    `  - severity:`,
+    `      "info" when present: true AND score >= 7`,
+    `      "minor" when present: true AND 4 <= score < 7`,
+    `      "major" when present: false OR score < 4`,
+    `      "critical" when the concept is not only absent but a core user flow depends on it`,
+    `Also produce a "summary" (one sentence, 20-600 chars): overall verdict on whether this is a shippable implementation of the user request vs a keyword-dense placeholder.`,
+    `BE SKEPTICAL. Keyword matching already passed \u2014 your job is to catch what keyword matching misses. If the agent shipped a working build, say so. If it shipped a stub, say so. Don't grade on effort.`,
+    `Return STRICT JSON. No prose outside the JSON.`
+  ];
+  return promptLines.join("\n");
+}
+/**
+ * Ask an LLM judge to grade, concept by concept, whether a build really
+ * implements what the user requested, and fold the grades into one score.
+ *
+ * Never rejects: any failure in the LLM call or in reading its reply is
+ * reported as a result with `available: false` and the message in `error`.
+ *
+ * @param input - judge input; `expectedConcepts` drives grading and weighting,
+ *   the rest is forwarded to `buildPrompt`.
+ * @param options - optional overrides: `model`, `timeoutMs`, `maxSourceChars`,
+ *   `maxPerFileChars`, `maxHtmlChars`, `llm` (passed through to the LLM call),
+ *   `weightConcepts` ("mean" by default, or "complexity"), `complexityWeights`.
+ * @returns a result object with `score` (0-1, three decimals), `presentCount`
+ *   (findings that are present with score >= 7), `totalCount` (number of
+ *   expected concepts), `findings`, `summary`, `durationMs`, `costUsd`,
+ *   `available`, and `error` on failure.
+ */
+async function runSemanticConceptJudge(input, options = {}) {
+  const start = Date.now();
+  const totalCount = input.expectedConcepts.length;
+  // Nothing to grade: report an unavailable result without calling the LLM.
+  if (totalCount === 0) {
+    return {
+      kind: "semantic-concept",
+      version: SEMANTIC_CONCEPT_JUDGE_VERSION,
+      score: 0,
+      presentCount: 0,
+      totalCount: 0,
+      findings: [],
+      summary: "no expected concepts declared",
+      durationMs: 0,
+      costUsd: null,
+      available: false,
+      error: "no expected concepts declared"
+    };
+  }
+  const opts = {
+    model: options.model ?? DEFAULT_MODEL,
+    timeoutMs: options.timeoutMs ?? DEFAULT_TIMEOUT,
+    maxSourceChars: options.maxSourceChars ?? DEFAULT_MAX_SOURCE,
+    maxPerFileChars: options.maxPerFileChars ?? DEFAULT_MAX_PER_FILE,
+    maxHtmlChars: options.maxHtmlChars ?? DEFAULT_MAX_HTML,
+    llm: options.llm ?? {},
+    weightConcepts: options.weightConcepts ?? "mean",
+    complexityWeights: { ...DEFAULT_COMPLEXITY_WEIGHTS, ...options.complexityWeights ?? {} }
+  };
+  // In "mean" mode every concept weighs 1. Otherwise an explicit per-concept
+  // `weight` wins, then (in "complexity" mode) the weight for its complexity.
+  const weightForConcept = (spec) => {
+    if (opts.weightConcepts === "mean") return 1;
+    if (spec.weight != null) return spec.weight;
+    if (opts.weightConcepts === "complexity") {
+      return opts.complexityWeights[spec.complexity ?? "render"] ?? 1;
+    }
+    return 1;
+  };
+  const weightByName = new Map(
+    input.expectedConcepts.map((c) => [c.name, weightForConcept(c)])
+  );
+  try {
+    const { value, result } = await callLlmJson(
+      {
+        model: opts.model,
+        messages: [
+          {
+            role: "system",
+            content: "You are a strict code-review judge. Return strict JSON only. No prose outside the JSON. A keyword in a comment is NOT a working implementation."
+          },
+          { role: "user", content: buildPrompt(input, opts) }
+        ],
+        jsonSchema: { name: "semantic_concept_judge", schema: SEMANTIC_SCHEMA },
+        temperature: 0,
+        timeoutMs: opts.timeoutMs
+      },
+      opts.llm
+    );
+    if (!value?.concepts || !Array.isArray(value.concepts)) {
+      throw new Error('judge returned malformed response \u2014 expected array under "concepts"');
+    }
+    const findings = value.concepts.map((c) => {
+      // A non-numeric score coerces to NaN, which passes straight through
+      // Math.max/Math.min and would turn the aggregate score into NaN.
+      // Treat it as 0, the same as a missing score.
+      const rawScore = Number(c.score ?? 0);
+      return {
+        concept: String(c.concept),
+        present: Boolean(c.present),
+        score: Number.isNaN(rawScore) ? 0 : Math.max(0, Math.min(10, rawScore)),
+        evidence: String(c.evidence ?? ""),
+        severity: ["critical", "major", "minor", "info"].includes(c.severity) ? c.severity : "info"
+      };
+    });
+    const presentCount = findings.filter((f) => f.present && f.score >= 7).length;
+    let weightSum = 0;
+    let weightedScoreSum = 0;
+    for (const f of findings) {
+      // Findings whose name matches no expected concept get weight 1.
+      const w = weightByName.get(f.concept) ?? 1;
+      weightSum += w;
+      weightedScoreSum += w * f.score;
+    }
+    // Fall back to a plain mean when the weights do not add up to something positive.
+    const scoreAvg = weightSum > 0 ? weightedScoreSum / weightSum : findings.reduce((a, f) => a + f.score, 0) / Math.max(1, findings.length);
+    return {
+      kind: "semantic-concept",
+      version: SEMANTIC_CONCEPT_JUDGE_VERSION,
+      // Per-concept scores are 0-10; the aggregate is reported on 0-1.
+      score: Number((scoreAvg / 10).toFixed(3)),
+      presentCount,
+      totalCount,
+      findings,
+      summary: String(value.summary ?? ""),
+      durationMs: Date.now() - start,
+      costUsd: result.costUsd ?? null,
+      available: true
+    };
+  } catch (err) {
+    return {
+      kind: "semantic-concept",
+      version: SEMANTIC_CONCEPT_JUDGE_VERSION,
+      score: 0,
+      presentCount: 0,
+      totalCount,
+      findings: [],
+      summary: "",
+      durationMs: Date.now() - start,
+      costUsd: null,
+      available: false,
+      error: err instanceof Error ? err.message : String(err)
+    };
+  }
+}
+/**
+ * Bind a fixed set of judge options and get back a one-argument judge.
+ *
+ * @param options - options forwarded to `runSemanticConceptJudge` on every call
+ *   (defaults to an empty object).
+ * @returns a function that takes the judge input and returns the result of
+ *   `runSemanticConceptJudge(input, options)`.
+ */
+function createSemanticConceptJudge(options = {}) {
+  return (judgeInput) => {
+    return runSemanticConceptJudge(judgeInput, options);
+  };
+}
+// src/analyst/chat-client.ts
+/**
+ * Build a chat client for the transport named in `opts.transport`.
+ *
+ * Supported transports:
+ *   - "router": `LlmClient` at `opts.baseUrl` (default
+ *     "https://router.tangle.tools/v1") with `opts.apiKey`.
+ *   - "cli-bridge": `LlmClient` at `opts.baseUrl` (default
+ *     "http://127.0.0.1:3344/v1") with `opts.bearer` (default "") as the key.
+ *   - "direct-provider": `LlmClient` at `opts.baseUrl` with `opts.apiKey`.
+ *   - "sandbox-sdk": forwards each request to `opts.chat`.
+ *   - "mock": forwards each request to `opts.handler`.
+ *
+ * For every transport the request's model falls back to `opts.defaultModel`
+ * (see `resolveModel`).
+ *
+ * @param opts - transport selection plus the transport-specific settings above.
+ * @returns an object with `transport`, `defaultModel` and an async `chat(req, callOpts)`.
+ * @throws Error when `opts.transport` is not one of the supported values
+ *   (previously this fell through and returned `undefined`, which only failed
+ *   later at the first `.chat` call).
+ */
+function createChatClient(opts) {
+  switch (opts.transport) {
+    case "router":
+      return wrapLlmClient(
+        opts.transport,
+        opts.defaultModel,
+        new LlmClient({
+          baseUrl: opts.baseUrl ?? "https://router.tangle.tools/v1",
+          apiKey: opts.apiKey
+        })
+      );
+    case "cli-bridge":
+      return wrapLlmClient(
+        opts.transport,
+        opts.defaultModel,
+        new LlmClient({
+          baseUrl: opts.baseUrl ?? "http://127.0.0.1:3344/v1",
+          apiKey: opts.bearer ?? ""
+        })
+      );
+    case "direct-provider":
+      return wrapLlmClient(
+        opts.transport,
+        opts.defaultModel,
+        new LlmClient({
+          baseUrl: opts.baseUrl,
+          apiKey: opts.apiKey
+        })
+      );
+    case "sandbox-sdk":
+      return {
+        transport: "sandbox-sdk",
+        defaultModel: opts.defaultModel,
+        chat: async (req, callOpts) => opts.chat(resolveModel(req, opts.defaultModel), callOpts)
+      };
+    case "mock":
+      return {
+        transport: "mock",
+        defaultModel: opts.defaultModel,
+        chat: async (req, callOpts) => opts.handler(resolveModel(req, opts.defaultModel), callOpts)
+      };
+    default:
+      throw new Error(
+        `createChatClient: unknown transport "${String(opts.transport)}". Expected one of: router, cli-bridge, direct-provider, sandbox-sdk, mock.`
+      );
+  }
+}
+/**
+ * Adapt an LLM client exposing `call(request)` to the chat-client shape.
+ *
+ * The returned `chat` resolves the model (request model, else `defaultModel`),
+ * forwards the request fields to `inner.call`, and, when `callOpts.signal` is
+ * given, rejects as soon as that signal aborts. The signal is not forwarded to
+ * `inner.call`, so an abort stops the wait but not the underlying call.
+ *
+ * @param transport - transport label exposed on the returned client.
+ * @param defaultModel - model used when a request carries none.
+ * @param inner - object with a `call(request)` method returning a promise.
+ * @returns `{ transport, defaultModel, chat }`.
+ */
+function wrapLlmClient(transport, defaultModel, inner) {
+  return {
+    transport,
+    defaultModel,
+    chat: async (req, callOpts) => {
+      const resolved = resolveModel(req, defaultModel);
+      const signal = callOpts?.signal;
+      // Already cancelled: fail before starting a call nobody will wait for.
+      if (signal?.aborted) throw toAbortError(signal);
+      const call = inner.call({
+        model: resolved.model,
+        messages: req.messages,
+        jsonMode: req.jsonMode,
+        jsonSchema: req.jsonSchema,
+        temperature: req.temperature,
+        maxTokens: req.maxTokens,
+        timeoutMs: req.timeoutMs
+      });
+      if (!signal) return await call;
+      let onAbort;
+      const aborted = new Promise((_resolve, reject) => {
+        onAbort = () => reject(toAbortError(signal));
+        signal.addEventListener("abort", onAbort, { once: true });
+      });
+      try {
+        return await Promise.race([call, aborted]);
+      } finally {
+        // Detach the listener once the race is decided so a long-lived signal
+        // does not accumulate one listener per completed call.
+        signal.removeEventListener("abort", onAbort);
+      }
+    }
+  };
+}
+/**
+ * Turn an abort signal into a promise that can only reject.
+ *
+ * @param signal - the abort signal to observe.
+ * @returns a promise that is already rejected when the signal has aborted,
+ *   otherwise one that rejects when the signal's "abort" event fires. It never
+ *   fulfils. The rejection value comes from `toAbortError`.
+ */
+function abortAsRejection(signal) {
+  if (!signal.aborted) {
+    return new Promise((_resolve, reject) => {
+      const onAbort = () => {
+        reject(toAbortError(signal));
+      };
+      signal.addEventListener("abort", onAbort, { once: true });
+    });
+  }
+  return Promise.reject(toAbortError(signal));
+}
+/**
+ * Pick the error to surface for an aborted signal.
+ *
+ * @param signal - object whose `reason` is inspected.
+ * @returns `signal.reason` itself when it is an `Error`; otherwise a new
+ *   `Error` named "AbortError" with a fixed message.
+ */
+function toAbortError(signal) {
+  const { reason } = signal;
+  if (!(reason instanceof Error)) {
+    const fallback = new Error("ChatClient.chat: aborted");
+    fallback.name = "AbortError";
+    return fallback;
+  }
+  return reason;
+}
+/**
+ * Make sure a chat request carries a model.
+ *
+ * @param req - the chat request.
+ * @param defaultModel - model to fall back to when `req.model` is falsy.
+ * @returns `req` itself when it already has a model; otherwise a shallow copy
+ *   of `req` with `model` set to `defaultModel`.
+ * @throws Error when neither `req.model` nor `defaultModel` is set.
+ */
+function resolveModel(req, defaultModel) {
+  if (req.model) {
+    return req;
+  }
+  if (defaultModel) {
+    return { ...req, model: defaultModel };
+  }
+  throw new Error(
+    "ChatClient.chat: no model on request and no defaultModel on the client. Either pass req.model or bind defaultModel at createChatClient()."
+  );
+}
+export {
+  createAnalystAi,
+  computeTraceMetrics,
+  deriveEfficiencyFindings,
+  behavioralAnalyst,
+  buildDefaultAnalystRegistry,
+  Mutex,
+  LockedJsonlAppender,
+  resetLockedAppendersForTesting,
+  FindingsStore,
+  defaultIsMaterial,
+  diffFindings,
+  buildSkillUsageReport,
+  emitSkillUsageFindings,
+  SkillUsageAnalyst,
+  SKILL_USAGE_ANALYST,
+  DEFAULT_RUN_SCORE_WEIGHTS,
+  aggregateRunScore,
+  clamp01,
+  RunCritic,
+  DEFAULT_COMPLEXITY_WEIGHTS,
+  SEMANTIC_CONCEPT_JUDGE_VERSION,
+  runSemanticConceptJudge,
+  createSemanticConceptJudge,
+  createChatClient
+};
+//# sourceMappingURL=chunk-7W4SM7FD.js.map