@tangle-network/agent-eval 0.77.0 → 0.80.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/README.md +50 -19
  2. package/dist/adapters/http.d.ts +2 -2
  3. package/dist/adapters/langchain.d.ts +2 -2
  4. package/dist/adapters/otel.d.ts +4 -4
  5. package/dist/{agent-profile-DYRboYWu.d.ts → agent-profile-aSEaJ9Pl.d.ts} +1 -1
  6. package/dist/analyst/index.d.ts +42 -8
  7. package/dist/analyst/index.js +32 -2
  8. package/dist/analyst/index.js.map +1 -1
  9. package/dist/authenticity/index.d.ts +54 -1
  10. package/dist/authenticity/index.js +88 -1
  11. package/dist/authenticity/index.js.map +1 -1
  12. package/dist/belief-state/index.d.ts +188 -0
  13. package/dist/belief-state/index.js +486 -0
  14. package/dist/belief-state/index.js.map +1 -0
  15. package/dist/benchmarks/index.d.ts +2 -2
  16. package/dist/calibration-Cpr3WaX3.d.ts +101 -0
  17. package/dist/campaign/index.d.ts +11 -11
  18. package/dist/campaign/index.js +4 -4
  19. package/dist/chunk-4DIJWVUT.js +131 -0
  20. package/dist/chunk-4DIJWVUT.js.map +1 -0
  21. package/dist/{chunk-7W4SM7FD.js → chunk-5LVWPNS5.js} +91 -91
  22. package/dist/chunk-5LVWPNS5.js.map +1 -0
  23. package/dist/{chunk-WYIHD6EB.js → chunk-CF67I6QY.js} +1 -1
  24. package/dist/chunk-CF67I6QY.js.map +1 -0
  25. package/dist/{chunk-XPILG2CA.js → chunk-GXHLRXDI.js} +2 -2
  26. package/dist/{chunk-F3SRAAZO.js → chunk-KWRRMR3J.js} +15 -1
  27. package/dist/chunk-KWRRMR3J.js.map +1 -0
  28. package/dist/chunk-NPCTHQIO.js +91 -0
  29. package/dist/chunk-NPCTHQIO.js.map +1 -0
  30. package/dist/{chunk-JYE3WOTE.js → chunk-RPLZ4OIB.js} +10 -1
  31. package/dist/chunk-RPLZ4OIB.js.map +1 -0
  32. package/dist/{chunk-6EKXFFGQ.js → chunk-RTWFUK6A.js} +2 -2
  33. package/dist/{chunk-XGNCBAVZ.js → chunk-XQL22JDG.js} +2 -2
  34. package/dist/{chunk-GJJNJVIR.js → chunk-XXNIODOM.js} +2 -2
  35. package/dist/contract/index.d.ts +128 -15
  36. package/dist/contract/index.js +118 -2
  37. package/dist/contract/index.js.map +1 -1
  38. package/dist/{control-BgA6BYTm.d.ts → control-CehLtoET.d.ts} +1 -1
  39. package/dist/control.d.ts +2 -2
  40. package/dist/control.js +2 -2
  41. package/dist/governance/index.d.ts +1 -1
  42. package/dist/hosted/index.d.ts +4 -4
  43. package/dist/{index-DsnOpCO6.d.ts → index-B1RKber3.d.ts} +1 -1
  44. package/dist/index.d.ts +127 -26
  45. package/dist/index.js +32 -7
  46. package/dist/index.js.map +1 -1
  47. package/dist/{insight-report-Df3lxYXM.d.ts → insight-report-dlpEzQDi.d.ts} +1 -1
  48. package/dist/{kind-factory-DW9XWPvM.d.ts → kind-factory-DqV2t1Xk.d.ts} +1 -1
  49. package/dist/meta-eval/index.d.ts +6 -99
  50. package/dist/meta-eval/index.js +7 -76
  51. package/dist/meta-eval/index.js.map +1 -1
  52. package/dist/off-policy-DiwuKKg7.d.ts +132 -0
  53. package/dist/openapi.json +1 -1
  54. package/dist/{outcome-store-D6KWmYvj.d.ts → outcome-store-rnXLEqSn.d.ts} +1 -1
  55. package/dist/{provenance-B-TFszPW.d.ts → provenance-jG-Gngg8.d.ts} +3 -3
  56. package/dist/{registry-DuVYiTvw.d.ts → registry-BK0Zee01.d.ts} +1 -1
  57. package/dist/{release-report-CN8hJlhk.d.ts → release-report-CXXZlR8g.d.ts} +2 -2
  58. package/dist/reporting.d.ts +5 -5
  59. package/dist/{researcher-C_KJyIGg.d.ts → researcher-rInLj9De.d.ts} +2 -2
  60. package/dist/rl.d.ts +10 -140
  61. package/dist/rl.js +8 -122
  62. package/dist/rl.js.map +1 -1
  63. package/dist/{rubric-predictive-validity-D_4BSXGV.d.ts → rubric-predictive-validity-CLPuwiUw.d.ts} +2 -2
  64. package/dist/{run-improvement-loop-BqYH2vCR.d.ts → run-improvement-loop-BAl_aVOZ.d.ts} +2 -4
  65. package/dist/{run-record-BgTFzO2r.d.ts → run-record-sItO5ftF.d.ts} +11 -0
  66. package/dist/{semantic-concept-judge-CV9Wlx4t.d.ts → semantic-concept-judge-qXEUV2w7.d.ts} +3 -3
  67. package/dist/{summary-report-ByiOUrHj.d.ts → summary-report-BTaXq1TS.d.ts} +1 -1
  68. package/dist/traces.d.ts +1 -1
  69. package/dist/traces.js +2 -2
  70. package/dist/{types-Bba0vl1V.d.ts → types-4mm2msnR.d.ts} +12 -4
  71. package/dist/{types-CRD68aH7.d.ts → types-DRvV0zRo.d.ts} +10 -1
  72. package/dist/workflow/index.d.ts +4 -4
  73. package/dist/workflow/index.js +1 -1
  74. package/docs/auto-research-loop-end-to-end.md +1 -1
  75. package/docs/feature-guide.md +4 -4
  76. package/docs/multi-shot-optimization.md +61 -115
  77. package/docs/product-eval-adoption.md +1 -1
  78. package/docs/research/belief-state-agent-eval-roadmap.md +558 -0
  79. package/docs/research/research-roadmap.md +1 -0
  80. package/docs/three-package-architecture.md +1 -1
  81. package/docs/trace-analysis.md +19 -0
  82. package/package.json +7 -2
  83. package/dist/chunk-7W4SM7FD.js.map +0 -1
  84. package/dist/chunk-F3SRAAZO.js.map +0 -1
  85. package/dist/chunk-JYE3WOTE.js.map +0 -1
  86. package/dist/chunk-WYIHD6EB.js.map +0 -1
  87. /package/dist/{chunk-XPILG2CA.js.map → chunk-GXHLRXDI.js.map} +0 -0
  88. /package/dist/{chunk-6EKXFFGQ.js.map → chunk-RTWFUK6A.js.map} +0 -0
  89. /package/dist/{chunk-XGNCBAVZ.js.map → chunk-XQL22JDG.js.map} +0 -0
  90. /package/dist/{chunk-GJJNJVIR.js.map → chunk-XXNIODOM.js.map} +0 -0
@@ -5,6 +5,33 @@ var DEFAULT_MOCK = /\bmock|\bfake|\bdummy|\bstub\b|simulat|hardcoded|placeholder
5
5
  function basename(p) {
6
6
  return p.split("/").pop() ?? p;
7
7
  }
8
+ function escapeRe(s) {
9
+ return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
10
+ }
11
+ function declaredNames(content) {
12
+ const names = /* @__PURE__ */ new Set();
13
+ const re = /\b(?:contract|library|interface|abstract\s+contract|class|enum|struct|module|package)\s+([A-Za-z_]\w*)/g;
14
+ let m;
15
+ while (m = re.exec(content)) {
16
+ const name = m[1];
17
+ if (name && name.length >= 4) names.add(name);
18
+ }
19
+ return [...names];
20
+ }
21
+ function isArtifactReferenced(required, others) {
22
+ if (!required.length || !others.length) return false;
23
+ return required.some((rf) => {
24
+ const stem = rf.path.replace(/\.[^.]+$/, "");
25
+ const base = basename(rf.path);
26
+ const names = declaredNames(rf.content ?? "");
27
+ return others.some((o) => {
28
+ const c = o.content ?? "";
29
+ if (!c) return false;
30
+ if (c.includes(base) || c.includes(stem)) return true;
31
+ return names.some((n) => new RegExp(`\\b${escapeRe(n)}\\b`).test(c));
32
+ });
33
+ });
34
+ }
8
35
  function scoreAuthenticity(files, signals) {
9
36
  const w = {
10
37
  artifact: signals.weights?.artifact ?? 40,
@@ -24,6 +51,8 @@ function scoreAuthenticity(files, signals) {
24
51
  const usesRealImpl = signals.realImpl.test(signals.requiredArtifact ? requiredText : allText);
25
52
  const realInfra = signals.realInfra.test(allText);
26
53
  const wired = signals.wiring ? signals.wiring.test(otherText || allText) : false;
54
+ const artifactReferenced = isArtifactReferenced(required, others);
55
+ const artifactWired = wired || artifactReferenced;
27
56
  const fakeShim = files.some(
28
57
  (f) => signals.fakeShim.test(basename(f.path)) || signals.fakeShim.test(f.content ?? "")
29
58
  );
@@ -32,6 +61,7 @@ function scoreAuthenticity(files, signals) {
32
61
  ) ?? []).length;
33
62
  const loc = Math.max(1, allText.split("\n").length);
34
63
  const mockDensity = Math.min(100, Math.round(mockHits / loc * 1e3));
64
+ const decorativeArtifact = requiredArtifactPresent && usesRealImpl && !artifactWired;
35
65
  let realness = 0;
36
66
  if (requiredArtifactPresent) realness += w.artifact;
37
67
  if (usesRealImpl) realness += w.impl;
@@ -56,6 +86,10 @@ function scoreAuthenticity(files, signals) {
56
86
  flags.push(`HIGH_MOCK_DENSITY: ${mockDensity} mock/stub markers per 1000 LOC`);
57
87
  if (signals.wiring && requiredArtifactPresent && !wired)
58
88
  flags.push("NOT_WIRED: artifact exists but is never used by the client");
89
+ if (decorativeArtifact)
90
+ flags.push(
91
+ "DEAD_ARTIFACT: required artifact is not referenced/imported anywhere \u2014 decorative or dead code"
92
+ );
59
93
  return {
60
94
  realness,
61
95
  requiredArtifactPresent,
@@ -63,6 +97,8 @@ function scoreAuthenticity(files, signals) {
63
97
  usesRealImpl,
64
98
  realInfra,
65
99
  wired,
100
+ artifactReferenced,
101
+ artifactWired,
66
102
  fakeShim,
67
103
  mockDensity,
68
104
  flags
@@ -76,6 +112,9 @@ function gateRealness(r, opts = {}) {
76
112
  if (r.fakeShim && !r.usesRealImpl) {
77
113
  return { gated: true, reason: "fake shim with no real implementation" };
78
114
  }
115
+ if (opts.requireArtifactWired && r.requiredArtifactPresent && r.usesRealImpl && !r.artifactWired) {
116
+ return { gated: true, reason: "required artifact present but never wired into the system" };
117
+ }
79
118
  if (r.realness < floor)
80
119
  return { gated: true, reason: `realness ${r.realness} below floor ${floor}` };
81
120
  return { gated: false };
@@ -120,9 +159,57 @@ ${fileDigest(files, { prioritize: opts.prioritize })}`;
120
159
  };
121
160
  }
122
161
  }
162
+ async function judgeRealnessLlm(files, complete, opts = {}) {
163
+ const system = "You are a skeptical auditor. Rate how REAL an agent's build is vs the intended deliverable, 0-100. A genuine implementation of the HARD part on the intended infrastructure is SUBSTANTIALLY REAL (>=50) even if peripheral layers are stubbed; a pure simulator / facade / branded-type stand-in / no-op-stubbed dependency with no real implementation is FAKE (<=25). Judge the core on its merits and note the runtime. " + (opts.rubric ? `Domain rubric: ${opts.rubric} ` : "") + 'Respond with ONLY JSON: {"isReal":0-100,"why":"one sentence"}.';
164
+ const user = (opts.intent ? `Intended deliverable: ${opts.intent}
165
+
166
+ ` : "") + `Produced files:
167
+ ${fileDigest(files, { prioritize: opts.prioritize })}`;
168
+ try {
169
+ const raw = await complete(system, user);
170
+ const m = raw.match(/\{[\s\S]*\}/);
171
+ if (!m) return { isReal: 0, rationale: "unparseable judge response" };
172
+ const j = JSON.parse(m[0]);
173
+ return {
174
+ isReal: clampPct(j.isReal),
175
+ rationale: typeof j.why === "string" ? j.why : ""
176
+ };
177
+ } catch (err) {
178
+ return {
179
+ isReal: 0,
180
+ rationale: `judge error: ${err instanceof Error ? err.message : String(err)}`
181
+ };
182
+ }
183
+ }
184
+ async function scoreRealnessBlended(files, signals, complete, opts = {}) {
185
+ const det = scoreAuthenticity(files, signals);
186
+ const [lo, hi] = opts.grayBand ?? [30, 70];
187
+ const mockGray = opts.mockGrayThreshold ?? 8;
188
+ const conflict = det.requiredArtifactPresent && det.usesRealImpl && (det.fakeShim || !det.wired || det.mockDensity >= mockGray);
189
+ const midRange = det.realness >= lo && det.realness <= hi;
190
+ let band;
191
+ if (conflict || midRange) band = "gray";
192
+ else if (det.realness < lo) band = "clean-fake";
193
+ else band = "clean-real";
194
+ if (band !== "gray") {
195
+ return { ...det, blendedRealness: det.realness, band, consultedLlm: false };
196
+ }
197
+ const judgment = await judgeRealnessLlm(files, complete, {
198
+ intent: opts.intent,
199
+ rubric: opts.rubric,
200
+ prioritize: signals.requiredArtifact
201
+ });
202
+ const blendedRealness = Math.max(
203
+ 0,
204
+ Math.min(100, Math.round(0.25 * det.realness + 0.75 * judgment.isReal))
205
+ );
206
+ return { ...det, blendedRealness, band, consultedLlm: true, judgment };
207
+ }
123
208
  export {
124
209
  gateRealness,
210
+ judgeRealnessLlm,
125
211
  scoreAuthenticity,
126
- scoreAuthenticityNuance
212
+ scoreAuthenticityNuance,
213
+ scoreRealnessBlended
127
214
  };
128
215
  //# sourceMappingURL=index.js.map
@@ -1 +1 @@
1
- {"version":3,"sources":["../../src/authenticity/index.ts"],"sourcesContent":["/**\n * Authenticity — \"is this real, or convincing BS?\"\n *\n * Pass/build-style scoring rewards anything that compiles and renders, so an\n * agent can ship a polished frontend with a FAKE in-browser engine and zero of\n * the required on-chain/contract work, and outscore a half-finished real\n * implementation. This module scores what buildability does not: did the agent\n * actually build the intended thing on the intended infra, or fake it.\n *\n * Two layers:\n * - DETERMINISTIC `scoreAuthenticity` — calibrated by construction (no LLM,\n * trustworthy today). Structural signals over the produced files, driven by\n * a domain `AuthenticitySignals` config: required artifact present, real\n * implementation of the hard part, real infra calls, wiring, fake-shim\n * detection, mock/stub density.\n * - LLM NUANCE `scoreAuthenticityNuance` — mocked% / fake% / unique% for the\n * \"looks real but is hollow\" cases structure can't see.\n *\n * `gateRealness` is the anti-Goodhart gate: a submission missing the required\n * artifact (or faking it) is capped and cannot rank high regardless of how\n * buildable it is. Domain-agnostic; ships a Solidity/Fhenix preset.\n *\n * Input is the produced-state currency: `{ path, content }[]` — exactly what\n * `extractProducedState(...).artifacts` yields, so any consumer can feed a run's\n * produced state straight in.\n */\n\nexport interface ProducedFile {\n path: string\n content?: string\n}\n\nexport interface AuthenticitySignals {\n /** Human label for the domain (e.g. 'fhenix-fhe'). */\n label: string\n /** A file the task REQUIRES (e.g. /\\.sol$/ for an on-chain task). */\n requiredArtifact?: RegExp\n /** Vendored/3rd-party paths to exclude from required-artifact detection. */\n vendored?: RegExp\n /** Real implementation of the hard part, inside the required artifact\n * (e.g. Fhenix encrypted types + FHE.* ops). 
Matched against content, so it\n * fails on comments/strings only if the regex is written tightly. */\n realImpl: RegExp\n /** Real use of the intended client infra (e.g. cofhejs.encrypt() calls). */\n realInfra: RegExp\n /** Evidence the artifact is actually wired/used (e.g. contract writes). */\n wiring?: RegExp\n /** A fake shim standing in for the real thing — matched on file path AND body. */\n fakeShim: RegExp\n /** Mock/stub/TODO markers. Defaults to a generic set. */\n mock?: RegExp\n /** Score weights (default 40/25/20/15). */\n weights?: { artifact?: number; impl?: number; infra?: number; wiring?: number }\n}\n\nexport interface AuthenticityResult {\n /** Deterministic realness, 0 (BS) … 100 (real on real infra). */\n realness: number\n requiredArtifactPresent: boolean\n requiredArtifactCount: number\n usesRealImpl: boolean\n realInfra: boolean\n wired: boolean\n fakeShim: boolean\n /** mock/stub markers per 1000 LOC, capped at 100. */\n mockDensity: number\n /** Human-readable BS flags — what's missing or faked. */\n flags: string[]\n}\n\nconst DEFAULT_MOCK =\n /\\bmock|\\bfake|\\bdummy|\\bstub\\b|simulat|hardcoded|placeholder|TODO|not\\s+implemented|FIXME/i\n\nfunction basename(p: string): string {\n return p.split('/').pop() ?? p\n}\n\n/** Deterministic authenticity scan of produced files. Pure — same files in,\n * same score out. No LLM, no IO. */\nexport function scoreAuthenticity(\n files: readonly ProducedFile[],\n signals: AuthenticitySignals,\n): AuthenticityResult {\n const w = {\n artifact: signals.weights?.artifact ?? 40,\n impl: signals.weights?.impl ?? 25,\n infra: signals.weights?.infra ?? 20,\n wiring: signals.weights?.wiring ?? 15,\n }\n const mockRe = signals.mock ?? DEFAULT_MOCK\n\n const required = signals.requiredArtifact\n ? files.filter(\n (f) => signals.requiredArtifact!.test(f.path) && !(signals.vendored?.test(f.path) ?? false),\n )\n : []\n const others = signals.requiredArtifact ? 
files.filter((f) => !required.includes(f)) : files\n\n const requiredText = required.map((f) => f.content ?? '').join('\\n')\n const otherText = others.map((f) => f.content ?? '').join('\\n')\n const allText = files.map((f) => f.content ?? '').join('\\n')\n\n const requiredArtifactPresent = signals.requiredArtifact ? required.length > 0 : true\n // Real impl looked for in the required artifact when there is one, else anywhere.\n const usesRealImpl = signals.realImpl.test(signals.requiredArtifact ? requiredText : allText)\n const realInfra = signals.realInfra.test(allText)\n const wired = signals.wiring ? signals.wiring.test(otherText || allText) : false\n const fakeShim = files.some(\n (f) => signals.fakeShim.test(basename(f.path)) || signals.fakeShim.test(f.content ?? ''),\n )\n\n const mockHits = (\n allText.match(\n new RegExp(mockRe.source, mockRe.flags.includes('g') ? mockRe.flags : `${mockRe.flags}g`),\n ) ?? []\n ).length\n const loc = Math.max(1, allText.split('\\n').length)\n const mockDensity = Math.min(100, Math.round((mockHits / loc) * 1000))\n\n let realness = 0\n if (requiredArtifactPresent) realness += w.artifact\n if (usesRealImpl) realness += w.impl\n if (realInfra) realness += w.infra\n if (wired) realness += w.wiring\n if (fakeShim) realness -= 25\n realness -= Math.min(20, mockDensity)\n realness = Math.max(0, Math.min(100, realness))\n\n const flags: string[] = []\n if (signals.requiredArtifact && !requiredArtifactPresent) {\n flags.push(\n `NO_REQUIRED_ARTIFACT: task needs ${signals.label} artifact (${signals.requiredArtifact}); none produced`,\n )\n }\n if (requiredArtifactPresent && signals.requiredArtifact && !usesRealImpl) {\n flags.push('ARTIFACT_NO_REAL_IMPL: required artifact exists but lacks the real implementation')\n }\n if (fakeShim) flags.push('FAKE_SHIM: ships a client-side stand-in simulating the real infra')\n if (!realInfra && !requiredArtifactPresent)\n flags.push('NO_REAL_INFRA: no real infra calls — cosmetic at best')\n if 
(mockDensity >= 8)\n flags.push(`HIGH_MOCK_DENSITY: ${mockDensity} mock/stub markers per 1000 LOC`)\n if (signals.wiring && requiredArtifactPresent && !wired)\n flags.push('NOT_WIRED: artifact exists but is never used by the client')\n\n return {\n realness,\n requiredArtifactPresent,\n requiredArtifactCount: required.length,\n usesRealImpl,\n realInfra,\n wired,\n fakeShim,\n mockDensity,\n flags,\n }\n}\n\nexport interface RealnessGate {\n gated: boolean\n reason?: string\n}\n\n/** Anti-Goodhart gate: a required-artifact-missing or faked submission is\n * capped and cannot rank high regardless of buildability. */\nexport function gateRealness(\n r: AuthenticityResult,\n opts: { floor?: number; requireArtifact?: boolean } = {},\n): RealnessGate {\n const floor = opts.floor ?? 30\n if ((opts.requireArtifact ?? true) && !r.requiredArtifactPresent) {\n return { gated: true, reason: 'required artifact missing' }\n }\n if (r.fakeShim && !r.usesRealImpl) {\n return { gated: true, reason: 'fake shim with no real implementation' }\n }\n if (r.realness < floor)\n return { gated: true, reason: `realness ${r.realness} below floor ${floor}` }\n return { gated: false }\n}\n\n// ── LLM nuance layer ─────────────────────────────────────────────────────────\n\nexport interface AuthenticityNuance {\n /** 0 (nothing mocked) … 100 (entirely mocked). */\n mockedPct: number\n /** 0 (genuine) … 100 (a hollow facade / cargo-culted). */\n fakePct: number\n /** 0 (boilerplate/template clone) … 100 (distinctive real work). */\n uniquePct: number\n verdict: string\n}\n\n/** A minimal completion fn — inject your model caller (router/tcloud). Keeps\n * this module free of any specific LLM client. */\nexport type CompleteFn = (system: string, user: string) => Promise<string>\n\nfunction fileDigest(\n files: readonly ProducedFile[],\n opts: { maxFiles?: number; perFile?: number; prioritize?: RegExp } = {},\n): string {\n const maxFiles = opts.maxFiles ?? 14\n const perFile = opts.perFile ?? 
1200\n // Lead with the required-artifact files (e.g. .sol) so a truncated digest\n // never hides the very thing the judge must assess.\n const ordered = opts.prioritize\n ? [...files].sort(\n (a, b) => Number(opts.prioritize!.test(b.path)) - Number(opts.prioritize!.test(a.path)),\n )\n : files\n return ordered\n .slice(0, maxFiles)\n .map((f) => `// ${f.path}\\n${(f.content ?? '').slice(0, perFile)}`)\n .join('\\n\\n')\n}\n\nfunction clampPct(v: unknown): number {\n const n = typeof v === 'number' ? v : Number(v)\n return Number.isFinite(n) ? Math.max(0, Math.min(100, Math.round(n))) : 0\n}\n\n/**\n * LLM nuance scoring — judges the \"looks real but is hollow\" axis structure\n * misses. Inject a `complete` caller; returns mocked/fake/unique % + a verdict.\n * Fail-soft: a bad/unparseable response yields a worst-case (fully-fake) read,\n * never a false pass.\n */\nexport async function scoreAuthenticityNuance(\n files: readonly ProducedFile[],\n complete: CompleteFn,\n opts: { intent?: string; prioritize?: RegExp } = {},\n): Promise<AuthenticityNuance> {\n const system =\n 'You audit whether an agent BUILT THE REAL THING or faked it. Be skeptical: ' +\n 'a pretty UI, cosmetic labels, simulated/in-memory stand-ins for real infra, ' +\n 'and cargo-culted imports do NOT count as real. Respond with ONLY JSON: ' +\n '{\"mockedPct\":0-100,\"fakePct\":0-100,\"uniquePct\":0-100,\"verdict\":\"one sentence\"}. ' +\n 'mockedPct = how much is mocked/stubbed; fakePct = how hollow/facade it is; ' +\n 'uniquePct = how distinctive vs boilerplate.'\n const user =\n (opts.intent ? 
`Intended deliverable: ${opts.intent}\\n\\n` : '') +\n `Produced files:\\n${fileDigest(files, { prioritize: opts.prioritize })}`\n try {\n const raw = await complete(system, user)\n const m = raw.match(/\\{[\\s\\S]*\\}/)\n if (!m)\n return { mockedPct: 100, fakePct: 100, uniquePct: 0, verdict: 'unparseable judge response' }\n const j = JSON.parse(m[0]) as Record<string, unknown>\n return {\n mockedPct: clampPct(j.mockedPct),\n fakePct: clampPct(j.fakePct),\n uniquePct: clampPct(j.uniquePct),\n verdict: typeof j.verdict === 'string' ? j.verdict : '',\n }\n } catch (err) {\n return {\n mockedPct: 100,\n fakePct: 100,\n uniquePct: 0,\n verdict: `judge error: ${err instanceof Error ? err.message : String(err)}`,\n }\n }\n}\n\n// Domain `AuthenticitySignals` (e.g. a Solidity/Fhenix preset) live in the\n// CONSUMER, not the substrate — this module stays domain-agnostic.\n"],"mappings":";;;AAsEA,IAAM,eACJ;AAEF,SAAS,SAAS,GAAmB;AACnC,SAAO,EAAE,MAAM,GAAG,EAAE,IAAI,KAAK;AAC/B;AAIO,SAAS,kBACd,OACA,SACoB;AACpB,QAAM,IAAI;AAAA,IACR,UAAU,QAAQ,SAAS,YAAY;AAAA,IACvC,MAAM,QAAQ,SAAS,QAAQ;AAAA,IAC/B,OAAO,QAAQ,SAAS,SAAS;AAAA,IACjC,QAAQ,QAAQ,SAAS,UAAU;AAAA,EACrC;AACA,QAAM,SAAS,QAAQ,QAAQ;AAE/B,QAAM,WAAW,QAAQ,mBACrB,MAAM;AAAA,IACJ,CAAC,MAAM,QAAQ,iBAAkB,KAAK,EAAE,IAAI,KAAK,EAAE,QAAQ,UAAU,KAAK,EAAE,IAAI,KAAK;AAAA,EACvF,IACA,CAAC;AACL,QAAM,SAAS,QAAQ,mBAAmB,MAAM,OAAO,CAAC,MAAM,CAAC,SAAS,SAAS,CAAC,CAAC,IAAI;AAEvF,QAAM,eAAe,SAAS,IAAI,CAAC,MAAM,EAAE,WAAW,EAAE,EAAE,KAAK,IAAI;AACnE,QAAM,YAAY,OAAO,IAAI,CAAC,MAAM,EAAE,WAAW,EAAE,EAAE,KAAK,IAAI;AAC9D,QAAM,UAAU,MAAM,IAAI,CAAC,MAAM,EAAE,WAAW,EAAE,EAAE,KAAK,IAAI;AAE3D,QAAM,0BAA0B,QAAQ,mBAAmB,SAAS,SAAS,IAAI;AAEjF,QAAM,eAAe,QAAQ,SAAS,KAAK,QAAQ,mBAAmB,eAAe,OAAO;AAC5F,QAAM,YAAY,QAAQ,UAAU,KAAK,OAAO;AAChD,QAAM,QAAQ,QAAQ,SAAS,QAAQ,OAAO,KAAK,aAAa,OAAO,IAAI;AAC3E,QAAM,WAAW,MAAM;AAAA,IACrB,CAAC,MAAM,QAAQ,SAAS,KAAK,SAAS,EAAE,IAAI,CAAC,KAAK,QAAQ,SAAS,KAAK,EAAE,WAAW,EAAE;AAAA,EACzF;AAEA,QAAM,YACJ,QAAQ;AAAA,IACN,IAAI,OAAO,OAAO,QAAQ,OAAO,MAAM,SAAS,GAAG,IAAI,OAAO,QAAQ,GAAG
,OAAO,KAAK,GAAG;AAAA,EAC1F,KAAK,CAAC,GACN;AACF,QAAM,MAAM,KAAK,IAAI,GAAG,QAAQ,MAAM,IAAI,EAAE,MAAM;AAClD,QAAM,cAAc,KAAK,IAAI,KAAK,KAAK,MAAO,WAAW,MAAO,GAAI,CAAC;AAErE,MAAI,WAAW;AACf,MAAI,wBAAyB,aAAY,EAAE;AAC3C,MAAI,aAAc,aAAY,EAAE;AAChC,MAAI,UAAW,aAAY,EAAE;AAC7B,MAAI,MAAO,aAAY,EAAE;AACzB,MAAI,SAAU,aAAY;AAC1B,cAAY,KAAK,IAAI,IAAI,WAAW;AACpC,aAAW,KAAK,IAAI,GAAG,KAAK,IAAI,KAAK,QAAQ,CAAC;AAE9C,QAAM,QAAkB,CAAC;AACzB,MAAI,QAAQ,oBAAoB,CAAC,yBAAyB;AACxD,UAAM;AAAA,MACJ,oCAAoC,QAAQ,KAAK,cAAc,QAAQ,gBAAgB;AAAA,IACzF;AAAA,EACF;AACA,MAAI,2BAA2B,QAAQ,oBAAoB,CAAC,cAAc;AACxE,UAAM,KAAK,mFAAmF;AAAA,EAChG;AACA,MAAI,SAAU,OAAM,KAAK,mEAAmE;AAC5F,MAAI,CAAC,aAAa,CAAC;AACjB,UAAM,KAAK,4DAAuD;AACpE,MAAI,eAAe;AACjB,UAAM,KAAK,sBAAsB,WAAW,iCAAiC;AAC/E,MAAI,QAAQ,UAAU,2BAA2B,CAAC;AAChD,UAAM,KAAK,4DAA4D;AAEzE,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,uBAAuB,SAAS;AAAA,IAChC;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AASO,SAAS,aACd,GACA,OAAsD,CAAC,GACzC;AACd,QAAM,QAAQ,KAAK,SAAS;AAC5B,OAAK,KAAK,mBAAmB,SAAS,CAAC,EAAE,yBAAyB;AAChE,WAAO,EAAE,OAAO,MAAM,QAAQ,4BAA4B;AAAA,EAC5D;AACA,MAAI,EAAE,YAAY,CAAC,EAAE,cAAc;AACjC,WAAO,EAAE,OAAO,MAAM,QAAQ,wCAAwC;AAAA,EACxE;AACA,MAAI,EAAE,WAAW;AACf,WAAO,EAAE,OAAO,MAAM,QAAQ,YAAY,EAAE,QAAQ,gBAAgB,KAAK,GAAG;AAC9E,SAAO,EAAE,OAAO,MAAM;AACxB;AAkBA,SAAS,WACP,OACA,OAAqE,CAAC,GAC9D;AACR,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,UAAU,KAAK,WAAW;AAGhC,QAAM,UAAU,KAAK,aACjB,CAAC,GAAG,KAAK,EAAE;AAAA,IACT,CAAC,GAAG,MAAM,OAAO,KAAK,WAAY,KAAK,EAAE,IAAI,CAAC,IAAI,OAAO,KAAK,WAAY,KAAK,EAAE,IAAI,CAAC;AAAA,EACxF,IACA;AACJ,SAAO,QACJ,MAAM,GAAG,QAAQ,EACjB,IAAI,CAAC,MAAM,MAAM,EAAE,IAAI;AAAA,GAAM,EAAE,WAAW,IAAI,MAAM,GAAG,OAAO,CAAC,EAAE,EACjE,KAAK,MAAM;AAChB;AAEA,SAAS,SAAS,GAAoB;AACpC,QAAM,IAAI,OAAO,MAAM,WAAW,IAAI,OAAO,CAAC;AAC9C,SAAO,OAAO,SAAS,CAAC,IAAI,KAAK,IAAI,GAAG,KAAK,IAAI,KAAK,KAAK,MAAM,CAAC,CAAC,CAAC,IAAI;AAC1E;AAQA,eAAsB,wBACpB,OACA,UACA,OAAiD,CAAC,GACrB;AAC7B,QAAM,SACJ;AAMF,QAAM,QACH,KAAK,SAAS,yBAAyB,KAAK,MAAM;AAAA;AAAA,IAAS,MAC5D;AAAA,EAAoB,WAAW,OAAO,EAAE,YAAY,KAAK,WAAW,CAAC,CAAC;AACxE,MAAI;A
ACF,UAAM,MAAM,MAAM,SAAS,QAAQ,IAAI;AACvC,UAAM,IAAI,IAAI,MAAM,aAAa;AACjC,QAAI,CAAC;AACH,aAAO,EAAE,WAAW,KAAK,SAAS,KAAK,WAAW,GAAG,SAAS,6BAA6B;AAC7F,UAAM,IAAI,KAAK,MAAM,EAAE,CAAC,CAAC;AACzB,WAAO;AAAA,MACL,WAAW,SAAS,EAAE,SAAS;AAAA,MAC/B,SAAS,SAAS,EAAE,OAAO;AAAA,MAC3B,WAAW,SAAS,EAAE,SAAS;AAAA,MAC/B,SAAS,OAAO,EAAE,YAAY,WAAW,EAAE,UAAU;AAAA,IACvD;AAAA,EACF,SAAS,KAAK;AACZ,WAAO;AAAA,MACL,WAAW;AAAA,MACX,SAAS;AAAA,MACT,WAAW;AAAA,MACX,SAAS,gBAAgB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,IAC3E;AAAA,EACF;AACF;","names":[]}
1
+ {"version":3,"sources":["../../src/authenticity/index.ts"],"sourcesContent":["/**\n * Authenticity — \"is this real, or convincing BS?\"\n *\n * Pass/build-style scoring rewards anything that compiles and renders, so an\n * agent can ship a polished frontend with a FAKE in-browser engine and zero of\n * the required on-chain/contract work, and outscore a half-finished real\n * implementation. This module scores what buildability does not: did the agent\n * actually build the intended thing on the intended infra, or fake it.\n *\n * Two layers:\n * - DETERMINISTIC `scoreAuthenticity` — calibrated by construction (no LLM,\n * trustworthy today). Structural signals over the produced files, driven by\n * a domain `AuthenticitySignals` config: required artifact present, real\n * implementation of the hard part, real infra calls, wiring, fake-shim\n * detection, mock/stub density.\n * - LLM NUANCE `scoreAuthenticityNuance` — mocked% / fake% / unique% for the\n * \"looks real but is hollow\" cases structure can't see.\n *\n * `gateRealness` is the anti-Goodhart gate: a submission missing the required\n * artifact (or faking it) is capped and cannot rank high regardless of how\n * buildable it is. Domain-agnostic; ships a Solidity/Fhenix preset.\n *\n * Input is the produced-state currency: `{ path, content }[]` — exactly what\n * `extractProducedState(...).artifacts` yields, so any consumer can feed a run's\n * produced state straight in.\n */\n\nexport interface ProducedFile {\n path: string\n content?: string\n}\n\nexport interface AuthenticitySignals {\n /** Human label for the domain (e.g. 'fhenix-fhe'). */\n label: string\n /** A file the task REQUIRES (e.g. /\\.sol$/ for an on-chain task). */\n requiredArtifact?: RegExp\n /** Vendored/3rd-party paths to exclude from required-artifact detection. */\n vendored?: RegExp\n /** Real implementation of the hard part, inside the required artifact\n * (e.g. Fhenix encrypted types + FHE.* ops). 
Matched against content, so it\n * fails on comments/strings only if the regex is written tightly. */\n realImpl: RegExp\n /** Real use of the intended client infra (e.g. cofhejs.encrypt() calls). */\n realInfra: RegExp\n /** Evidence the artifact is actually wired/used (e.g. contract writes). */\n wiring?: RegExp\n /** A fake shim standing in for the real thing — matched on file path AND body. */\n fakeShim: RegExp\n /** Mock/stub/TODO markers. Defaults to a generic set. */\n mock?: RegExp\n /** Score weights (default 40/25/20/15). */\n weights?: { artifact?: number; impl?: number; infra?: number; wiring?: number }\n}\n\nexport interface AuthenticityResult {\n /** Deterministic realness, 0 (BS) … 100 (real on real infra). */\n realness: number\n requiredArtifactPresent: boolean\n requiredArtifactCount: number\n usesRealImpl: boolean\n realInfra: boolean\n wired: boolean\n /** The required artifact is actually referenced/imported by other (non-artifact)\n * files — i.e. wired into the rest of the system, not dead code. Domain-agnostic:\n * a deliverable nothing else uses is suspect in any vertical. */\n artifactReferenced: boolean\n /** Convenience: the artifact is connected to the running system, via either the\n * domain wiring signal OR a structural reference. */\n artifactWired: boolean\n fakeShim: boolean\n /** mock/stub markers per 1000 LOC, capped at 100. */\n mockDensity: number\n /** Human-readable BS flags — what's missing or faked. */\n flags: string[]\n}\n\nconst DEFAULT_MOCK =\n /\\bmock|\\bfake|\\bdummy|\\bstub\\b|simulat|hardcoded|placeholder|TODO|not\\s+implemented|FIXME/i\n\nfunction basename(p: string): string {\n return p.split('/').pop() ?? p\n}\n\nfunction escapeRe(s: string): string {\n return s.replace(/[.*+?^${}()|[\\]\\\\]/g, '\\\\$&')\n}\n\n/** Top-level symbols a source file declares (contract/library/class/etc.), used to\n * test whether other files reference the artifact. Language-agnostic keyword set. 
*/\nfunction declaredNames(content: string): string[] {\n const names = new Set<string>()\n const re =\n /\\b(?:contract|library|interface|abstract\\s+contract|class|enum|struct|module|package)\\s+([A-Za-z_]\\w*)/g\n let m: RegExpExecArray | null\n while ((m = re.exec(content))) {\n const name = m[1]\n if (name && name.length >= 4) names.add(name)\n }\n return [...names]\n}\n\n/** Is a required artifact referenced/imported by any non-artifact file? Catches the\n * \"decorative / dead-code artifact\" facade (a real-looking deliverable nothing in\n * the running system imports, deploys, or calls). Purely structural — no domain. */\nfunction isArtifactReferenced(\n required: readonly ProducedFile[],\n others: readonly ProducedFile[],\n): boolean {\n if (!required.length || !others.length) return false\n return required.some((rf) => {\n const stem = rf.path.replace(/\\.[^.]+$/, '') // import-path stem (no ext)\n const base = basename(rf.path) // filename incl. ext\n const names = declaredNames(rf.content ?? '')\n return others.some((o) => {\n const c = o.content ?? ''\n if (!c) return false\n if (c.includes(base) || c.includes(stem)) return true // import of the path\n return names.some((n) => new RegExp(`\\\\b${escapeRe(n)}\\\\b`).test(c)) // symbol reference\n })\n })\n}\n\n/** Deterministic authenticity scan of produced files. Pure — same files in,\n * same score out. No LLM, no IO. */\nexport function scoreAuthenticity(\n files: readonly ProducedFile[],\n signals: AuthenticitySignals,\n): AuthenticityResult {\n const w = {\n artifact: signals.weights?.artifact ?? 40,\n impl: signals.weights?.impl ?? 25,\n infra: signals.weights?.infra ?? 20,\n wiring: signals.weights?.wiring ?? 15,\n }\n const mockRe = signals.mock ?? DEFAULT_MOCK\n\n const required = signals.requiredArtifact\n ? files.filter(\n (f) => signals.requiredArtifact!.test(f.path) && !(signals.vendored?.test(f.path) ?? false),\n )\n : []\n const others = signals.requiredArtifact ? 
files.filter((f) => !required.includes(f)) : files\n\n const requiredText = required.map((f) => f.content ?? '').join('\\n')\n const otherText = others.map((f) => f.content ?? '').join('\\n')\n const allText = files.map((f) => f.content ?? '').join('\\n')\n\n const requiredArtifactPresent = signals.requiredArtifact ? required.length > 0 : true\n // Real impl looked for in the required artifact when there is one, else anywhere.\n const usesRealImpl = signals.realImpl.test(signals.requiredArtifact ? requiredText : allText)\n const realInfra = signals.realInfra.test(allText)\n const wired = signals.wiring ? signals.wiring.test(otherText || allText) : false\n // Structural: is the required artifact actually used by the rest of the system?\n const artifactReferenced = isArtifactReferenced(required, others)\n const artifactWired = wired || artifactReferenced\n const fakeShim = files.some(\n (f) => signals.fakeShim.test(basename(f.path)) || signals.fakeShim.test(f.content ?? ''),\n )\n\n const mockHits = (\n allText.match(\n new RegExp(mockRe.source, mockRe.flags.includes('g') ? mockRe.flags : `${mockRe.flags}g`),\n ) ?? []\n ).length\n const loc = Math.max(1, allText.split('\\n').length)\n const mockDensity = Math.min(100, Math.round((mockHits / loc) * 1000))\n\n // A real-looking artifact that nothing in the system imports/deploys/calls is\n // decorative (dead code) — a common facade. We REPORT this (flag + signal) but do\n // NOT auto-penalize the score: structural reference detection is noisy (an ABI or\n // placeholder-address file makes a dead contract look \"referenced\", while a strong\n // contract-only submission looks \"dead\"), so a score penalty manufactures false\n // negatives on legitimately-partial work. 
Gate on it only via opts.requireArtifactWired,\n // and let the LLM-nuance layer resolve the ambiguous middle band.\n const decorativeArtifact = requiredArtifactPresent && usesRealImpl && !artifactWired\n\n let realness = 0\n if (requiredArtifactPresent) realness += w.artifact\n if (usesRealImpl) realness += w.impl\n if (realInfra) realness += w.infra\n if (wired) realness += w.wiring\n if (fakeShim) realness -= 25\n realness -= Math.min(20, mockDensity)\n realness = Math.max(0, Math.min(100, realness))\n\n const flags: string[] = []\n if (signals.requiredArtifact && !requiredArtifactPresent) {\n flags.push(\n `NO_REQUIRED_ARTIFACT: task needs ${signals.label} artifact (${signals.requiredArtifact}); none produced`,\n )\n }\n if (requiredArtifactPresent && signals.requiredArtifact && !usesRealImpl) {\n flags.push('ARTIFACT_NO_REAL_IMPL: required artifact exists but lacks the real implementation')\n }\n if (fakeShim) flags.push('FAKE_SHIM: ships a client-side stand-in simulating the real infra')\n if (!realInfra && !requiredArtifactPresent)\n flags.push('NO_REAL_INFRA: no real infra calls — cosmetic at best')\n if (mockDensity >= 8)\n flags.push(`HIGH_MOCK_DENSITY: ${mockDensity} mock/stub markers per 1000 LOC`)\n if (signals.wiring && requiredArtifactPresent && !wired)\n flags.push('NOT_WIRED: artifact exists but is never used by the client')\n if (decorativeArtifact)\n flags.push(\n 'DEAD_ARTIFACT: required artifact is not referenced/imported anywhere — decorative or dead code',\n )\n\n return {\n realness,\n requiredArtifactPresent,\n requiredArtifactCount: required.length,\n usesRealImpl,\n realInfra,\n wired,\n artifactReferenced,\n artifactWired,\n fakeShim,\n mockDensity,\n flags,\n }\n}\n\nexport interface RealnessGate {\n gated: boolean\n reason?: string\n}\n\n/** Anti-Goodhart gate: a required-artifact-missing or faked submission is\n * capped and cannot rank high regardless of buildability. 
*/\nexport function gateRealness(\n r: AuthenticityResult,\n opts: { floor?: number; requireArtifact?: boolean; requireArtifactWired?: boolean } = {},\n): RealnessGate {\n const floor = opts.floor ?? 30\n if ((opts.requireArtifact ?? true) && !r.requiredArtifactPresent) {\n return { gated: true, reason: 'required artifact missing' }\n }\n if (r.fakeShim && !r.usesRealImpl) {\n return { gated: true, reason: 'fake shim with no real implementation' }\n }\n // Opt-in (default off): a vertical where the deliverable MUST be wired into the\n // running system can reject a decorative/dead artifact. Off by default because a\n // contract-only (incomplete-but-real) submission is legitimately partial, not fake.\n if (\n opts.requireArtifactWired &&\n r.requiredArtifactPresent &&\n r.usesRealImpl &&\n !r.artifactWired\n ) {\n return { gated: true, reason: 'required artifact present but never wired into the system' }\n }\n if (r.realness < floor)\n return { gated: true, reason: `realness ${r.realness} below floor ${floor}` }\n return { gated: false }\n}\n\n// ── LLM nuance layer ─────────────────────────────────────────────────────────\n\nexport interface AuthenticityNuance {\n /** 0 (nothing mocked) … 100 (entirely mocked). */\n mockedPct: number\n /** 0 (genuine) … 100 (a hollow facade / cargo-culted). */\n fakePct: number\n /** 0 (boilerplate/template clone) … 100 (distinctive real work). */\n uniquePct: number\n verdict: string\n}\n\n/** A minimal completion fn — inject your model caller (router/tcloud). Keeps\n * this module free of any specific LLM client. */\nexport type CompleteFn = (system: string, user: string) => Promise<string>\n\nfunction fileDigest(\n files: readonly ProducedFile[],\n opts: { maxFiles?: number; perFile?: number; prioritize?: RegExp } = {},\n): string {\n const maxFiles = opts.maxFiles ?? 14\n const perFile = opts.perFile ?? 1200\n // Lead with the required-artifact files (e.g. 
.sol) so a truncated digest\n // never hides the very thing the judge must assess.\n const ordered = opts.prioritize\n ? [...files].sort(\n (a, b) => Number(opts.prioritize!.test(b.path)) - Number(opts.prioritize!.test(a.path)),\n )\n : files\n return ordered\n .slice(0, maxFiles)\n .map((f) => `// ${f.path}\\n${(f.content ?? '').slice(0, perFile)}`)\n .join('\\n\\n')\n}\n\nfunction clampPct(v: unknown): number {\n const n = typeof v === 'number' ? v : Number(v)\n return Number.isFinite(n) ? Math.max(0, Math.min(100, Math.round(n))) : 0\n}\n\n/**\n * LLM nuance scoring — judges the \"looks real but is hollow\" axis structure\n * misses. Inject a `complete` caller; returns mocked/fake/unique % + a verdict.\n * Fail-soft: a bad/unparseable response yields a worst-case (fully-fake) read,\n * never a false pass.\n */\nexport async function scoreAuthenticityNuance(\n files: readonly ProducedFile[],\n complete: CompleteFn,\n opts: { intent?: string; prioritize?: RegExp } = {},\n): Promise<AuthenticityNuance> {\n const system =\n 'You audit whether an agent BUILT THE REAL THING or faked it. Be skeptical: ' +\n 'a pretty UI, cosmetic labels, simulated/in-memory stand-ins for real infra, ' +\n 'and cargo-culted imports do NOT count as real. Respond with ONLY JSON: ' +\n '{\"mockedPct\":0-100,\"fakePct\":0-100,\"uniquePct\":0-100,\"verdict\":\"one sentence\"}. ' +\n 'mockedPct = how much is mocked/stubbed; fakePct = how hollow/facade it is; ' +\n 'uniquePct = how distinctive vs boilerplate.'\n const user =\n (opts.intent ? 
`Intended deliverable: ${opts.intent}\\n\\n` : '') +\n `Produced files:\\n${fileDigest(files, { prioritize: opts.prioritize })}`\n try {\n const raw = await complete(system, user)\n const m = raw.match(/\\{[\\s\\S]*\\}/)\n if (!m)\n return { mockedPct: 100, fakePct: 100, uniquePct: 0, verdict: 'unparseable judge response' }\n const j = JSON.parse(m[0]) as Record<string, unknown>\n return {\n mockedPct: clampPct(j.mockedPct),\n fakePct: clampPct(j.fakePct),\n uniquePct: clampPct(j.uniquePct),\n verdict: typeof j.verdict === 'string' ? j.verdict : '',\n }\n } catch (err) {\n return {\n mockedPct: 100,\n fakePct: 100,\n uniquePct: 0,\n verdict: `judge error: ${err instanceof Error ? err.message : String(err)}`,\n }\n }\n}\n\n// ── Realness-direct LLM judge ─────────────────────────────────────────────────\n\nexport interface RealnessJudgment {\n /** 0 (facade/simulator) … 100 (real implementation on the intended infra). */\n isReal: number\n rationale: string\n}\n\n/**\n * Ask an LLM to rate realness DIRECTLY on a 0-100 scale — the axis that matched\n * human blind-labels in validation (F1 0.80→0.88 on the gray band; a fakePct/\n * hollowness proxy over-penalized \"real core + stubbed periphery\" partials, and a\n * weak judge model over-flagged — use a strong one). Domain-agnostic skeleton; the\n * consumer supplies `intent` (what the deliverable should be) and `rubric` (domain\n * specifics of real-vs-fake). Fail-closed: a bad response reads as fully fake.\n */\nexport async function judgeRealnessLlm(\n files: readonly ProducedFile[],\n complete: CompleteFn,\n opts: { intent?: string; rubric?: string; prioritize?: RegExp } = {},\n): Promise<RealnessJudgment> {\n const system =\n \"You are a skeptical auditor. Rate how REAL an agent's build is vs the intended \" +\n 'deliverable, 0-100. 
A genuine implementation of the HARD part on the intended ' +\n 'infrastructure is SUBSTANTIALLY REAL (>=50) even if peripheral layers are stubbed; ' +\n 'a pure simulator / facade / branded-type stand-in / no-op-stubbed dependency with ' +\n 'no real implementation is FAKE (<=25). Judge the core on its merits and note the ' +\n 'runtime. ' +\n (opts.rubric ? `Domain rubric: ${opts.rubric} ` : '') +\n 'Respond with ONLY JSON: {\"isReal\":0-100,\"why\":\"one sentence\"}.'\n const user =\n (opts.intent ? `Intended deliverable: ${opts.intent}\\n\\n` : '') +\n `Produced files:\\n${fileDigest(files, { prioritize: opts.prioritize })}`\n try {\n const raw = await complete(system, user)\n const m = raw.match(/\\{[\\s\\S]*\\}/)\n if (!m) return { isReal: 0, rationale: 'unparseable judge response' }\n const j = JSON.parse(m[0]) as Record<string, unknown>\n return {\n isReal: clampPct(j.isReal),\n rationale: typeof j.why === 'string' ? j.why : '',\n }\n } catch (err) {\n return {\n isReal: 0,\n rationale: `judge error: ${err instanceof Error ? err.message : String(err)}`,\n }\n }\n}\n\n// ── Blended pipeline: deterministic for the clean extremes, LLM for the gray band ─\n\nexport type RealnessBand = 'clean-real' | 'clean-fake' | 'gray'\n\nexport interface BlendedRealness extends AuthenticityResult {\n /** Final realness after (only-when-needed) LLM adjudication, 0…100. */\n blendedRealness: number\n band: RealnessBand\n /** True iff the LLM judge was actually consulted (gray band only). */\n consultedLlm: boolean\n /** Present iff the LLM was consulted. */\n judgment?: RealnessJudgment\n}\n\n/**\n * Score realness using the cheapest sufficient signal: trust the deterministic\n * scorer on the CLEAN extremes (obvious fakes / obviously-real-and-wired), and only\n * spend an LLM call on the GRAY band — cells that look real structurally but carry\n * fakeness markers (a fake shim, an unwired/dead artifact, high mock density) or land\n * mid-range. 
This caps LLM cost at the fraction of cells static analysis can't\n * resolve, which matters at multi-vertical / multi-partner scale.\n *\n * Domain-agnostic: the gray-band TRIGGER is structural; the LLM judges via the\n * consumer-supplied `intent`. Fail-closed (a bad LLM response reads as fully fake).\n */\nexport async function scoreRealnessBlended(\n files: readonly ProducedFile[],\n signals: AuthenticitySignals,\n complete: CompleteFn,\n opts: {\n intent?: string\n rubric?: string\n grayBand?: [number, number]\n mockGrayThreshold?: number\n } = {},\n): Promise<BlendedRealness> {\n const det = scoreAuthenticity(files, signals)\n const [lo, hi] = opts.grayBand ?? [30, 70]\n const mockGray = opts.mockGrayThreshold ?? 8\n\n // Structural conflict: a real artifact whose RUNTIME authenticity static analysis\n // can't settle — a fake shim is present, or it isn't wired to a real client (could\n // be a decorative contract next to a simulator, OR an incomplete-but-real build),\n // or mock density is high. Empirically (21 labeled cells) this routes 100% of the\n // deterministic errors to the LLM while leaving an error-free clean band.\n const conflict =\n det.requiredArtifactPresent &&\n det.usesRealImpl &&\n (det.fakeShim || !det.wired || det.mockDensity >= mockGray)\n const midRange = det.realness >= lo && det.realness <= hi\n\n let band: RealnessBand\n if (conflict || midRange) band = 'gray'\n else if (det.realness < lo) band = 'clean-fake'\n else band = 'clean-real'\n\n if (band !== 'gray') {\n return { ...det, blendedRealness: det.realness, band, consultedLlm: false }\n }\n\n // In the gray band the LLM read dominates (that's why we paid for it), with the\n // deterministic score as a light anchor. 
Weights 0.25/0.75 validated against blind\n // human labels (F1 0.88 vs 0.80 deterministic-only).\n const judgment = await judgeRealnessLlm(files, complete, {\n intent: opts.intent,\n rubric: opts.rubric,\n prioritize: signals.requiredArtifact,\n })\n const blendedRealness = Math.max(\n 0,\n Math.min(100, Math.round(0.25 * det.realness + 0.75 * judgment.isReal)),\n )\n return { ...det, blendedRealness, band, consultedLlm: true, judgment }\n}\n\n// Domain `AuthenticitySignals` (e.g. a Solidity/Fhenix preset) live in the\n// CONSUMER, not the substrate — this module stays domain-agnostic.\n"],"mappings":";;;AA6EA,IAAM,eACJ;AAEF,SAAS,SAAS,GAAmB;AACnC,SAAO,EAAE,MAAM,GAAG,EAAE,IAAI,KAAK;AAC/B;AAEA,SAAS,SAAS,GAAmB;AACnC,SAAO,EAAE,QAAQ,uBAAuB,MAAM;AAChD;AAIA,SAAS,cAAc,SAA2B;AAChD,QAAM,QAAQ,oBAAI,IAAY;AAC9B,QAAM,KACJ;AACF,MAAI;AACJ,SAAQ,IAAI,GAAG,KAAK,OAAO,GAAI;AAC7B,UAAM,OAAO,EAAE,CAAC;AAChB,QAAI,QAAQ,KAAK,UAAU,EAAG,OAAM,IAAI,IAAI;AAAA,EAC9C;AACA,SAAO,CAAC,GAAG,KAAK;AAClB;AAKA,SAAS,qBACP,UACA,QACS;AACT,MAAI,CAAC,SAAS,UAAU,CAAC,OAAO,OAAQ,QAAO;AAC/C,SAAO,SAAS,KAAK,CAAC,OAAO;AAC3B,UAAM,OAAO,GAAG,KAAK,QAAQ,YAAY,EAAE;AAC3C,UAAM,OAAO,SAAS,GAAG,IAAI;AAC7B,UAAM,QAAQ,cAAc,GAAG,WAAW,EAAE;AAC5C,WAAO,OAAO,KAAK,CAAC,MAAM;AACxB,YAAM,IAAI,EAAE,WAAW;AACvB,UAAI,CAAC,EAAG,QAAO;AACf,UAAI,EAAE,SAAS,IAAI,KAAK,EAAE,SAAS,IAAI,EAAG,QAAO;AACjD,aAAO,MAAM,KAAK,CAAC,MAAM,IAAI,OAAO,MAAM,SAAS,CAAC,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;AAAA,IACrE,CAAC;AAAA,EACH,CAAC;AACH;AAIO,SAAS,kBACd,OACA,SACoB;AACpB,QAAM,IAAI;AAAA,IACR,UAAU,QAAQ,SAAS,YAAY;AAAA,IACvC,MAAM,QAAQ,SAAS,QAAQ;AAAA,IAC/B,OAAO,QAAQ,SAAS,SAAS;AAAA,IACjC,QAAQ,QAAQ,SAAS,UAAU;AAAA,EACrC;AACA,QAAM,SAAS,QAAQ,QAAQ;AAE/B,QAAM,WAAW,QAAQ,mBACrB,MAAM;AAAA,IACJ,CAAC,MAAM,QAAQ,iBAAkB,KAAK,EAAE,IAAI,KAAK,EAAE,QAAQ,UAAU,KAAK,EAAE,IAAI,KAAK;AAAA,EACvF,IACA,CAAC;AACL,QAAM,SAAS,QAAQ,mBAAmB,MAAM,OAAO,CAAC,MAAM,CAAC,SAAS,SAAS,CAAC,CAAC,IAAI;AAEvF,QAAM,eAAe,SAAS,IAAI,CAAC,MAAM,EAAE,WAAW,EAAE,EAAE,KAAK,IAAI;AACnE,QAAM,YAAY,OAAO,IAAI,CAAC,MAAM,EAAE,WAAW,EAAE,EAAE,KAAK,IAA
I;AAC9D,QAAM,UAAU,MAAM,IAAI,CAAC,MAAM,EAAE,WAAW,EAAE,EAAE,KAAK,IAAI;AAE3D,QAAM,0BAA0B,QAAQ,mBAAmB,SAAS,SAAS,IAAI;AAEjF,QAAM,eAAe,QAAQ,SAAS,KAAK,QAAQ,mBAAmB,eAAe,OAAO;AAC5F,QAAM,YAAY,QAAQ,UAAU,KAAK,OAAO;AAChD,QAAM,QAAQ,QAAQ,SAAS,QAAQ,OAAO,KAAK,aAAa,OAAO,IAAI;AAE3E,QAAM,qBAAqB,qBAAqB,UAAU,MAAM;AAChE,QAAM,gBAAgB,SAAS;AAC/B,QAAM,WAAW,MAAM;AAAA,IACrB,CAAC,MAAM,QAAQ,SAAS,KAAK,SAAS,EAAE,IAAI,CAAC,KAAK,QAAQ,SAAS,KAAK,EAAE,WAAW,EAAE;AAAA,EACzF;AAEA,QAAM,YACJ,QAAQ;AAAA,IACN,IAAI,OAAO,OAAO,QAAQ,OAAO,MAAM,SAAS,GAAG,IAAI,OAAO,QAAQ,GAAG,OAAO,KAAK,GAAG;AAAA,EAC1F,KAAK,CAAC,GACN;AACF,QAAM,MAAM,KAAK,IAAI,GAAG,QAAQ,MAAM,IAAI,EAAE,MAAM;AAClD,QAAM,cAAc,KAAK,IAAI,KAAK,KAAK,MAAO,WAAW,MAAO,GAAI,CAAC;AASrE,QAAM,qBAAqB,2BAA2B,gBAAgB,CAAC;AAEvE,MAAI,WAAW;AACf,MAAI,wBAAyB,aAAY,EAAE;AAC3C,MAAI,aAAc,aAAY,EAAE;AAChC,MAAI,UAAW,aAAY,EAAE;AAC7B,MAAI,MAAO,aAAY,EAAE;AACzB,MAAI,SAAU,aAAY;AAC1B,cAAY,KAAK,IAAI,IAAI,WAAW;AACpC,aAAW,KAAK,IAAI,GAAG,KAAK,IAAI,KAAK,QAAQ,CAAC;AAE9C,QAAM,QAAkB,CAAC;AACzB,MAAI,QAAQ,oBAAoB,CAAC,yBAAyB;AACxD,UAAM;AAAA,MACJ,oCAAoC,QAAQ,KAAK,cAAc,QAAQ,gBAAgB;AAAA,IACzF;AAAA,EACF;AACA,MAAI,2BAA2B,QAAQ,oBAAoB,CAAC,cAAc;AACxE,UAAM,KAAK,mFAAmF;AAAA,EAChG;AACA,MAAI,SAAU,OAAM,KAAK,mEAAmE;AAC5F,MAAI,CAAC,aAAa,CAAC;AACjB,UAAM,KAAK,4DAAuD;AACpE,MAAI,eAAe;AACjB,UAAM,KAAK,sBAAsB,WAAW,iCAAiC;AAC/E,MAAI,QAAQ,UAAU,2BAA2B,CAAC;AAChD,UAAM,KAAK,4DAA4D;AACzE,MAAI;AACF,UAAM;AAAA,MACJ;AAAA,IACF;AAEF,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,uBAAuB,SAAS;AAAA,IAChC;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AASO,SAAS,aACd,GACA,OAAsF,CAAC,GACzE;AACd,QAAM,QAAQ,KAAK,SAAS;AAC5B,OAAK,KAAK,mBAAmB,SAAS,CAAC,EAAE,yBAAyB;AAChE,WAAO,EAAE,OAAO,MAAM,QAAQ,4BAA4B;AAAA,EAC5D;AACA,MAAI,EAAE,YAAY,CAAC,EAAE,cAAc;AACjC,WAAO,EAAE,OAAO,MAAM,QAAQ,wCAAwC;AAAA,EACxE;AAIA,MACE,KAAK,wBACL,EAAE,2BACF,EAAE,gBACF,CAAC,EAAE,eACH;AACA,WAAO,EAAE,OAAO,MAAM,QAAQ,4DAA4D;AAAA,EAC5F;AACA,MAAI,EAAE,WAAW;AACf,WAAO,EAAE,OAAO,MAAM,QAAQ,YAAY,EAAE,QAAQ,gBAAgB,KAAK,GAAG;AAC9E,SAAO,EAAE,OAAO,MAAM;AACxB;AAkBA,SAAS
,WACP,OACA,OAAqE,CAAC,GAC9D;AACR,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,UAAU,KAAK,WAAW;AAGhC,QAAM,UAAU,KAAK,aACjB,CAAC,GAAG,KAAK,EAAE;AAAA,IACT,CAAC,GAAG,MAAM,OAAO,KAAK,WAAY,KAAK,EAAE,IAAI,CAAC,IAAI,OAAO,KAAK,WAAY,KAAK,EAAE,IAAI,CAAC;AAAA,EACxF,IACA;AACJ,SAAO,QACJ,MAAM,GAAG,QAAQ,EACjB,IAAI,CAAC,MAAM,MAAM,EAAE,IAAI;AAAA,GAAM,EAAE,WAAW,IAAI,MAAM,GAAG,OAAO,CAAC,EAAE,EACjE,KAAK,MAAM;AAChB;AAEA,SAAS,SAAS,GAAoB;AACpC,QAAM,IAAI,OAAO,MAAM,WAAW,IAAI,OAAO,CAAC;AAC9C,SAAO,OAAO,SAAS,CAAC,IAAI,KAAK,IAAI,GAAG,KAAK,IAAI,KAAK,KAAK,MAAM,CAAC,CAAC,CAAC,IAAI;AAC1E;AAQA,eAAsB,wBACpB,OACA,UACA,OAAiD,CAAC,GACrB;AAC7B,QAAM,SACJ;AAMF,QAAM,QACH,KAAK,SAAS,yBAAyB,KAAK,MAAM;AAAA;AAAA,IAAS,MAC5D;AAAA,EAAoB,WAAW,OAAO,EAAE,YAAY,KAAK,WAAW,CAAC,CAAC;AACxE,MAAI;AACF,UAAM,MAAM,MAAM,SAAS,QAAQ,IAAI;AACvC,UAAM,IAAI,IAAI,MAAM,aAAa;AACjC,QAAI,CAAC;AACH,aAAO,EAAE,WAAW,KAAK,SAAS,KAAK,WAAW,GAAG,SAAS,6BAA6B;AAC7F,UAAM,IAAI,KAAK,MAAM,EAAE,CAAC,CAAC;AACzB,WAAO;AAAA,MACL,WAAW,SAAS,EAAE,SAAS;AAAA,MAC/B,SAAS,SAAS,EAAE,OAAO;AAAA,MAC3B,WAAW,SAAS,EAAE,SAAS;AAAA,MAC/B,SAAS,OAAO,EAAE,YAAY,WAAW,EAAE,UAAU;AAAA,IACvD;AAAA,EACF,SAAS,KAAK;AACZ,WAAO;AAAA,MACL,WAAW;AAAA,MACX,SAAS;AAAA,MACT,WAAW;AAAA,MACX,SAAS,gBAAgB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,IAC3E;AAAA,EACF;AACF;AAkBA,eAAsB,iBACpB,OACA,UACA,OAAkE,CAAC,GACxC;AAC3B,QAAM,SACJ,kaAMC,KAAK,SAAS,kBAAkB,KAAK,MAAM,MAAM,MAClD;AACF,QAAM,QACH,KAAK,SAAS,yBAAyB,KAAK,MAAM;AAAA;AAAA,IAAS,MAC5D;AAAA,EAAoB,WAAW,OAAO,EAAE,YAAY,KAAK,WAAW,CAAC,CAAC;AACxE,MAAI;AACF,UAAM,MAAM,MAAM,SAAS,QAAQ,IAAI;AACvC,UAAM,IAAI,IAAI,MAAM,aAAa;AACjC,QAAI,CAAC,EAAG,QAAO,EAAE,QAAQ,GAAG,WAAW,6BAA6B;AACpE,UAAM,IAAI,KAAK,MAAM,EAAE,CAAC,CAAC;AACzB,WAAO;AAAA,MACL,QAAQ,SAAS,EAAE,MAAM;AAAA,MACzB,WAAW,OAAO,EAAE,QAAQ,WAAW,EAAE,MAAM;AAAA,IACjD;AAAA,EACF,SAAS,KAAK;AACZ,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,WAAW,gBAAgB,eAAe,QAAQ,IAAI,UAAU,OAAO,GAAG,CAAC;AAAA,IAC7E;AAAA,EACF;AACF;AA2BA,eAAsB,qBACpB,OACA,SACA,UACA,OAKI,CAAC,GACqB;AAC1B,QAAM,MAAM,kBAAkB,OAAO,OAAO;AAC5C,QAAM,CAAC,IAAI,EAAE,IAAI,KAAK,YAAY,CAAC,IAAI,EAAE;AACzC,QAAM,WAAW,
KAAK,qBAAqB;AAO3C,QAAM,WACJ,IAAI,2BACJ,IAAI,iBACH,IAAI,YAAY,CAAC,IAAI,SAAS,IAAI,eAAe;AACpD,QAAM,WAAW,IAAI,YAAY,MAAM,IAAI,YAAY;AAEvD,MAAI;AACJ,MAAI,YAAY,SAAU,QAAO;AAAA,WACxB,IAAI,WAAW,GAAI,QAAO;AAAA,MAC9B,QAAO;AAEZ,MAAI,SAAS,QAAQ;AACnB,WAAO,EAAE,GAAG,KAAK,iBAAiB,IAAI,UAAU,MAAM,cAAc,MAAM;AAAA,EAC5E;AAKA,QAAM,WAAW,MAAM,iBAAiB,OAAO,UAAU;AAAA,IACvD,QAAQ,KAAK;AAAA,IACb,QAAQ,KAAK;AAAA,IACb,YAAY,QAAQ;AAAA,EACtB,CAAC;AACD,QAAM,kBAAkB,KAAK;AAAA,IAC3B;AAAA,IACA,KAAK,IAAI,KAAK,KAAK,MAAM,OAAO,IAAI,WAAW,OAAO,SAAS,MAAM,CAAC;AAAA,EACxE;AACA,SAAO,EAAE,GAAG,KAAK,iBAAiB,MAAM,cAAc,MAAM,SAAS;AACvE;","names":[]}
@@ -0,0 +1,188 @@
1
+ import { c as CalibrationReport } from '../calibration-Cpr3WaX3.js';
2
+ import { O as OffPolicyEstimate, a as OffPolicyOptions, b as OffPolicyTrajectory } from '../off-policy-DiwuKKg7.js';
3
+ import { T as TraceStore } from '../store-CKUAgsJz.js';
4
+ import '../schema-m0gsnbt3.js';
5
+ import '../outcome-store-rnXLEqSn.js';
6
+
7
+ type BeliefDecisionKind = 'continue' | 'verify' | 'ask' | 'retry' | 'stop' | 'memory-write' | 'memory-read' | 'tool-select' | 'skill-select' | 'workflow-select' | 'surface-promote'; // Closed set of decision categories; this is the type of BeliefDecisionPoint.kind.
8
+ type BeliefEvidenceSource = 'run' | 'span' | 'event' | 'finding' | 'memory' | 'knowledge' | 'policy'; // Closed set of evidence origins; this is the type of BeliefEvidenceRef.source.
9
+ interface BeliefEvidenceRef { // One evidence reference; BeliefDecisionPoint.evidence is an array of these.
10
+ source: BeliefEvidenceSource; // required: which category of record the reference points at
11
+ id: string; // required; NOTE(review): presumably the id of the referenced record within that source - confirm
12
+ runId?: string; // optional locator; presumably narrows the reference to a run - confirm
13
+ spanId?: string; // optional locator; presumably narrows the reference to a span - confirm
14
+ eventId?: string; // optional locator; presumably narrows the reference to an event - confirm
15
+ detail?: string; // optional free-form text
16
+ metadata?: Record<string, unknown>; // optional untyped extras; values are unknown and must be narrowed before use
17
+ }
18
+ interface BeliefDecisionOutcome { // Observed result attached to a decision via BeliefDecisionPoint.outcome; every field is optional.
19
+ success?: boolean; // optional pass/fail flag
20
+ score?: number; // optional numeric score; range is not stated in this declaration - TODO confirm
21
+ reward?: number; // optional numeric reward; scale is not stated in this declaration - TODO confirm
22
+ costUsd?: number; // optional cost; the name indicates US dollars
23
+ observedAt?: string; // optional; NOTE(review): presumably a timestamp string (format not declared here) - confirm
24
+ metadata?: Record<string, unknown>; // optional untyped extras; values must be narrowed before use
25
+ }
26
+ interface BeliefDecisionPoint { // A single recorded decision; the unit consumed by the calibration, selective-policy and off-policy functions declared in this file.
27
+ id: string; // required identifier of this decision point
28
+ runId: string; // required identifier of the run the decision belongs to
29
+ scenarioId?: string; // optional scenario identifier
30
+ stepIndex: number; // required; NOTE(review): presumably the position of the decision within its run - confirm base (0 or 1)
31
+ kind: BeliefDecisionKind; // required decision category
32
+ chosenAction: string; // required: the action that was taken
33
+ candidateActions?: string[]; // optional list of actions; presumably the alternatives that were available - confirm
34
+ confidence?: number; // optional; presumably a probability in [0, 1] - range is not enforced by the type, confirm
35
+ behaviorProb?: number; // optional; NOTE(review): looks like the logging-policy probability of chosenAction - confirm
36
+ targetProb?: number; // optional; NOTE(review): looks like the target-policy probability of chosenAction - confirm
37
+ qHat?: number | null; // optional and nullable; NOTE(review): presumably a value estimate used by off-policy evaluation - confirm
38
+ costUsd?: number; // optional cost; the name indicates US dollars
39
+ evidence: BeliefEvidenceRef[]; // required array (may be empty as far as the type is concerned)
40
+ outcome?: BeliefDecisionOutcome; // optional observed result; absent when no outcome is attached
41
+ metadata?: Record<string, unknown>; // optional untyped extras; values must be narrowed before use
42
+ }
43
+ interface BeliefDecisionExtractionDiagnostic { // One diagnostic entry; BeliefDecisionExtractionReport.diagnostics is an array of these.
44
+ runId: string; // required: run the diagnostic refers to
45
+ eventId?: string; // optional: event the diagnostic refers to, when there is one
46
+ severity: 'info' | 'warning' | 'error'; // required three-level severity
47
+ reason: string; // required human-readable explanation
48
+ }
49
+ interface BeliefDecisionExtractionReport { // Resolved value of the promise returned by extractBeliefDecisionPoints.
50
+ decisions: BeliefDecisionPoint[]; // extracted decision points
51
+ diagnostics: BeliefDecisionExtractionDiagnostic[]; // diagnostics reported alongside the decisions
52
+ }
53
+ type BeliefPolicyAction = 'accept' | 'defer' | 'verify' | 'ask' | 'retry' | 'stop'; // Actions a selective policy can return (BeliefPolicyDecision.action); thresholdSelectivePolicy accepts every member except accept as belowThresholdAction.
54
+ interface BeliefPolicyDecision { // Return value of BeliefSelectivePolicy.decide.
55
+ action: BeliefPolicyAction; // required: what the policy decided to do with the point
56
+ confidence?: number; // optional; presumably in [0, 1] - not enforced by the type, confirm
57
+ targetProb?: number; // optional; NOTE(review): presumably the policy probability for the action - confirm
58
+ qHat?: number | null; // optional and nullable; NOTE(review): presumably a value estimate - confirm
59
+ reason?: string; // optional human-readable explanation
60
+ }
61
+ interface BeliefSelectivePolicy { // Policy contract consumed by evaluateBeliefSelectivePolicy, analyzeBeliefPolicy and BeliefCalibrationOptions.policy.
62
+ id: string; // required policy identifier
63
+ decide(point: BeliefDecisionPoint): BeliefPolicyDecision; // synchronous: maps one decision point to a policy decision (no Promise in the signature)
64
+ }
65
+ interface BeliefOpeTargetPolicy { // Target-policy contract consumed by the off-policy functions declared in this file.
66
+ id: string; // required policy identifier
67
+ targetProbOf(point: BeliefDecisionPoint): number | null | undefined; // may return null or undefined; NOTE(review): presumably meaning no probability is available for the point - confirm how callers treat it
68
+ qHatOf?(point: BeliefDecisionPoint): number | null | undefined; // optional method; callers must check it exists before calling
69
+ }
70
+ interface BeliefUtilityOptions { // Numeric knobs passed as EvaluateBeliefSelectivePolicyOptions.utility; all optional, defaults are not visible in this declaration file.
71
+ successUtility?: number; // presumably the utility credited to a successful decision - confirm
72
+ failureUtility?: number; // presumably the utility credited to a failed decision - confirm sign convention
73
+ deferUtility?: number; // presumably the utility of a defer action - confirm
74
+ verifyCost?: number; // presumably the cost charged for a verify action - confirm
75
+ askCost?: number; // presumably the cost charged for an ask action - confirm
76
+ retryCost?: number; // presumably the cost charged for a retry action - confirm
77
+ stopUtility?: number; // presumably the utility of a stop action - confirm
78
+ costWeight?: number; // NOTE(review): presumably scales costUsd when folded into utility - confirm
79
+ }
80
+ interface BeliefSelectivePolicyMetrics { // Return value of evaluateBeliefSelectivePolicy; also embedded as BeliefPolicyEvaluationReport.selective.
81
+ policyId: string; // identifier of the evaluated policy
82
+ n: number; // NOTE(review): presumably the number of decision points evaluated - confirm
83
+ accepted: number; // presumably a count of points the policy accepted - confirm
84
+ rejected: number; // presumably a count of points the policy did not accept - confirm
85
+ coverage: number; // NOTE(review): presumably accepted / n - confirm
86
+ acceptedErrorRate: number; // presumably the error rate among accepted points - confirm
87
+ baselineUtility: number; // presumably utility without the policy applied - confirm
88
+ policyUtility: number; // presumably utility with the policy applied - confirm
89
+ utilityDelta: number; // NOTE(review): presumably policyUtility minus baselineUtility - confirm
90
+ utilityCi95: { // interval triple; the name suggests a 95% confidence interval - confirm which quantity and how it is estimated
91
+ mean: number;
92
+ lower: number;
93
+ upper: number;
94
+ };
95
+ rejectedMeanReward: number | null; // nullable; presumably null when no rejected point has a reward - confirm
96
+ recommendation: 'ship' | 'hold' | 'need_more_data'; // same three literals as BeliefEvaluationStatus
97
+ reasons: string[]; // human-readable strings accompanying the recommendation
98
+ }
99
+ interface BeliefOpeSupportDiagnostics { // Support diagnostics embedded as BeliefOpeReport.support.
100
+ supported: boolean; // overall flag; the criteria are not visible in this declaration file
101
+ n: number; // NOTE(review): presumably the number of usable samples - confirm
102
+ dropped: number; // presumably the number of points excluded - confirm
103
+ effectiveSampleSize: number; // NOTE(review): presumably compared against BeliefOpeOptions.minEffectiveSampleSize - confirm
104
+ effectiveSampleRatio: number; // NOTE(review): presumably compared against BeliefOpeOptions.minEffectiveSampleRatio - confirm
105
+ maxImportanceWeight: number; // presumably the largest importance weight observed - confirm
106
+ reasons: string[]; // human-readable strings accompanying the supported flag
107
+ }
108
+ interface BeliefOpeReport { // Return value of evaluateBeliefOffPolicy; also embedded as BeliefPolicyEvaluationReport.ope.
109
+ targetPolicyId: string; // identifier of the target policy that was evaluated
110
+ ips: OffPolicyEstimate; // NOTE(review): the name suggests an inverse-propensity-scoring estimate - confirm
111
+ snips: OffPolicyEstimate; // NOTE(review): the name suggests a self-normalized IPS estimate - confirm
112
+ dr: OffPolicyEstimate; // NOTE(review): the name suggests a doubly-robust estimate - confirm
113
+ support: BeliefOpeSupportDiagnostics; // support diagnostics for the estimates above
114
+ }
115
+ type BeliefEvaluationStatus = 'ship' | 'hold' | 'need_more_data'; // Three-way verdict; type of BeliefPolicyEvaluationReport.status and .selectiveStatus.
116
+ type BeliefCalibrationStatus = 'supported' | 'unsupported'; // Two-way flag; type of BeliefPolicyEvaluationReport.calibrationStatus.
117
+ type BeliefOpeStatus = 'supported' | 'unsupported' | 'not_requested'; // Type of BeliefPolicyEvaluationReport.opeStatus; adds not_requested to the two calibration states.
118
+ interface BeliefPolicyEvaluationReport { // Return value of analyzeBeliefPolicy; bundles the selective, calibration and off-policy results.
119
+ policyId: string; // identifier of the evaluated selective policy
120
+ n: number; // NOTE(review): presumably the number of decision points analyzed - confirm
121
+ status: BeliefEvaluationStatus; // overall verdict; how it is derived from the sub-statuses is not visible here
122
+ selectiveStatus: BeliefEvaluationStatus; // verdict of the selective-policy evaluation
123
+ calibrationStatus: BeliefCalibrationStatus; // whether calibration was supported
124
+ opeStatus: BeliefOpeStatus; // off-policy status, including the not_requested case
125
+ opeTargetPolicyId?: string; // optional identifier of the off-policy target policy
126
+ selective: BeliefSelectivePolicyMetrics; // always present
127
+ calibration?: CalibrationReport; // optional; presumably absent when calibrationStatus is unsupported - confirm
128
+ ope?: BeliefOpeReport; // optional; presumably absent when opeStatus is not_requested - confirm
129
+ diagnostics: string[]; // human-readable diagnostic strings
130
+ }
131
+
132
+ type BeliefCalibrationRegion = 'all' | 'accepted' | 'rejected'; // Type of BeliefCalibrationOptions.region; presumably selects which points are calibrated - confirm.
133
+ interface BeliefCalibrationOptions { // Optional settings for calibrateBeliefDecisions; defaults are not visible in this declaration file.
134
+ bins?: number; // NOTE(review): presumably the number of calibration bins - confirm
135
+ minPairs?: number; // NOTE(review): presumably the minimum number of usable points before a report is produced - confirm
136
+ policy?: BeliefSelectivePolicy; // optional policy; presumably used together with region to split accepted from rejected points - confirm
137
+ region?: BeliefCalibrationRegion; // optional region selector
138
+ }
139
+ declare function calibrateBeliefDecisions(points: BeliefDecisionPoint[], options?: BeliefCalibrationOptions): CalibrationReport | null; // Synchronous. Returns a CalibrationReport or null, so callers must handle null; NOTE(review): presumably null when too few usable points exist - confirm.
140
+
141
+ interface ExtractBeliefDecisionPointsOptions { // Optional settings for extractBeliefDecisionPoints.
142
+ runIds?: string[]; // optional; NOTE(review): presumably restricts extraction to these runs, all runs when omitted - confirm
143
+ }
144
+ declare function extractBeliefDecisionPoints(store: TraceStore, options?: ExtractBeliefDecisionPointsOptions): Promise<BeliefDecisionExtractionReport>; // Asynchronous: takes a TraceStore and resolves to the extracted decisions plus diagnostics. Rejection behavior is not visible in this declaration file.
145
+
146
+ interface BeliefOpeOptions extends OffPolicyOptions { // Options for evaluateBeliefOffPolicy; inherits every OffPolicyOptions field and adds three optional numbers.
147
+ minEffectiveSampleSize?: number; // NOTE(review): presumably a support threshold on effectiveSampleSize - confirm default
148
+ minEffectiveSampleRatio?: number; // NOTE(review): presumably a support threshold on effectiveSampleRatio - confirm default
149
+ maxDiagnostics?: number; // the only field beliefDecisionsToOffPolicyTrajectories accepts; presumably caps the diagnostics list - confirm
150
+ }
151
+ interface BeliefOffPolicyTrajectoryReport { // Return value of beliefDecisionsToOffPolicyTrajectories.
152
+ targetPolicyId: string; // identifier of the target policy used for the conversion
153
+ trajectories: OffPolicyTrajectory[]; // converted trajectories
154
+ dropped: number; // presumably the number of decision points that could not be converted - confirm
155
+ diagnostics: string[]; // human-readable diagnostic strings
156
+ }
157
+ declare function embeddedBeliefOpeTargetPolicy(id?: string): BeliefOpeTargetPolicy; // Builds a target policy; id is optional. NOTE(review): the name suggests it reads targetProb and qHat already stored on each decision point - confirm against the implementation.
158
+ declare function beliefDecisionsToOffPolicyTrajectories(points: BeliefDecisionPoint[], targetPolicy: BeliefOpeTargetPolicy, options?: Pick<BeliefOpeOptions, 'maxDiagnostics'>): BeliefOffPolicyTrajectoryReport; // Synchronous conversion of decision points into OffPolicyTrajectory values for the given target policy; only maxDiagnostics is accepted from BeliefOpeOptions.
159
+ declare function evaluateBeliefOffPolicy(points: BeliefDecisionPoint[], targetPolicy: BeliefOpeTargetPolicy, options?: BeliefOpeOptions): BeliefOpeReport; // Synchronous off-policy evaluation of targetPolicy over points; returns three OffPolicyEstimate values plus support diagnostics.
160
+
161
+ interface EvaluateBeliefSelectivePolicyOptions { // Optional settings for evaluateBeliefSelectivePolicy; defaults are not visible in this declaration file.
162
+ utility?: BeliefUtilityOptions; // optional utility and cost knobs
163
+ minN?: number; // NOTE(review): presumably the minimum sample count before a ship or hold verdict - confirm
164
+ minAccepted?: number; // NOTE(review): presumably the minimum number of accepted points required - confirm
165
+ minUtilityDelta?: number; // NOTE(review): presumably the minimum utilityDelta required to recommend ship - confirm
166
+ seed?: number; // NOTE(review): presumably seeds the resampling behind utilityCi95 - confirm
167
+ }
168
+ declare function thresholdSelectivePolicy(options: { // Builds a BeliefSelectivePolicy from a confidence threshold; the options object is required.
169
+ id?: string; // optional policy id; the default is not visible in this declaration file
170
+ confidenceThreshold: number; // required; NOTE(review): presumably compared against BeliefDecisionPoint.confidence - confirm inclusive vs exclusive
171
+ belowThresholdAction?: Exclude<BeliefPolicyAction, 'accept'>; // optional; any policy action except accept - presumably returned when confidence is below the threshold, confirm default
172
+ }): BeliefSelectivePolicy;
173
+ declare function evaluateBeliefSelectivePolicy(points: BeliefDecisionPoint[], policy: BeliefSelectivePolicy, options?: EvaluateBeliefSelectivePolicyOptions): BeliefSelectivePolicyMetrics; // Synchronous: evaluates policy over points and returns coverage, utility and recommendation metrics.
174
+
175
+ interface AnalyzeBeliefPolicyOpeOptions extends BeliefOpeOptions { // Off-policy settings for analyzeBeliefPolicy; inherits every BeliefOpeOptions field.
176
+ targetPolicy?: BeliefOpeTargetPolicy; // optional; NOTE(review): the fallback when omitted is not visible here - confirm
177
+ }
178
+ interface AnalyzeBeliefPolicyOptions { // Single options object taken by analyzeBeliefPolicy.
179
+ points: BeliefDecisionPoint[]; // required: decision points to analyze
180
+ policy: BeliefSelectivePolicy; // required: selective policy under evaluation
181
+ selective?: EvaluateBeliefSelectivePolicyOptions; // optional settings for the selective-policy evaluation
182
+ calibration?: BeliefCalibrationOptions; // optional settings for calibration
183
+ ope?: AnalyzeBeliefPolicyOpeOptions; // optional off-policy settings; presumably OPE is skipped (opeStatus not_requested) when omitted - confirm
184
+ requireOpe?: boolean; // optional; NOTE(review): presumably makes the overall status depend on supported OPE - confirm
185
+ }
186
+ declare function analyzeBeliefPolicy(options: AnalyzeBeliefPolicyOptions): BeliefPolicyEvaluationReport; // Synchronous entry point: takes points plus a policy and returns one report carrying selective, calibration and off-policy status and results.
187
+
188
+ export { type AnalyzeBeliefPolicyOpeOptions, type AnalyzeBeliefPolicyOptions, type BeliefCalibrationOptions, type BeliefCalibrationRegion, type BeliefCalibrationStatus, type BeliefDecisionExtractionDiagnostic, type BeliefDecisionExtractionReport, type BeliefDecisionKind, type BeliefDecisionOutcome, type BeliefDecisionPoint, type BeliefEvaluationStatus, type BeliefEvidenceRef, type BeliefEvidenceSource, type BeliefOffPolicyTrajectoryReport, type BeliefOpeOptions, type BeliefOpeReport, type BeliefOpeStatus, type BeliefOpeSupportDiagnostics, type BeliefOpeTargetPolicy, type BeliefPolicyAction, type BeliefPolicyDecision, type BeliefPolicyEvaluationReport, type BeliefSelectivePolicy, type BeliefSelectivePolicyMetrics, type BeliefUtilityOptions, type EvaluateBeliefSelectivePolicyOptions, type ExtractBeliefDecisionPointsOptions, analyzeBeliefPolicy, beliefDecisionsToOffPolicyTrajectories, calibrateBeliefDecisions, embeddedBeliefOpeTargetPolicy, evaluateBeliefOffPolicy, evaluateBeliefSelectivePolicy, extractBeliefDecisionPoints, thresholdSelectivePolicy };