@tangle-network/agent-eval 0.59.1 → 0.61.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. package/CHANGELOG.md +21 -0
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/http.js +1 -1
  4. package/dist/adapters/langchain.d.ts +1 -1
  5. package/dist/adapters/langchain.js +1 -1
  6. package/dist/adapters/otel.d.ts +5 -5
  7. package/dist/adapters/otel.js +1 -1
  8. package/dist/agent-profile-9J9hxdm2.d.ts +114 -0
  9. package/dist/benchmarks/index.d.ts +3 -3
  10. package/dist/benchmarks/index.js +2 -2
  11. package/dist/builder-eval/index.js +3 -3
  12. package/dist/campaign/index.d.ts +153 -9
  13. package/dist/campaign/index.js +229 -23
  14. package/dist/campaign/index.js.map +1 -1
  15. package/dist/{chunk-QDOSODID.js → chunk-3B7Y5AUR.js} +2 -2
  16. package/dist/{chunk-QYJT52YW.js → chunk-3BFEG2F6.js} +1 -1
  17. package/dist/chunk-3BFEG2F6.js.map +1 -0
  18. package/dist/{chunk-J4DIMSRK.js → chunk-6EKXFFGQ.js} +2 -2
  19. package/dist/{chunk-MHQPVHXU.js → chunk-6QDKWHLS.js} +2 -2
  20. package/dist/{chunk-63EPZQUZ.js → chunk-6REHLN5J.js} +2 -2
  21. package/dist/{chunk-GM476SZU.js → chunk-AIWHLG7J.js} +5 -5
  22. package/dist/{chunk-AIXHUIHG.js → chunk-B26KI423.js} +3 -3
  23. package/dist/{chunk-NCK5QLGT.js → chunk-F3SRAAZO.js} +2 -2
  24. package/dist/{chunk-N4SBKEPJ.js → chunk-GMXHLSLL.js} +107 -2
  25. package/dist/chunk-GMXHLSLL.js.map +1 -0
  26. package/dist/{chunk-VXNVVBZO.js → chunk-IHDHUN2X.js} +2 -2
  27. package/dist/{chunk-S3SDD56V.js → chunk-ITBRCT73.js} +2 -2
  28. package/dist/{chunk-OLIBRKRD.js → chunk-KX6F6NCG.js} +2 -2
  29. package/dist/{chunk-74Y2EMNH.js → chunk-OLULBECP.js} +18 -6
  30. package/dist/chunk-OLULBECP.js.map +1 -0
  31. package/dist/chunk-PQV2TKC3.js +27 -0
  32. package/dist/chunk-PQV2TKC3.js.map +1 -0
  33. package/dist/chunk-PZ5AY32C.js +10 -0
  34. package/dist/{chunk-UBPIXOC4.js → chunk-SBCB6VZY.js} +2 -2
  35. package/dist/chunk-SHTXZ4O2.js +113 -0
  36. package/dist/chunk-SHTXZ4O2.js.map +1 -0
  37. package/dist/{chunk-JB4UWIM6.js → chunk-SUGME4OT.js} +266 -15
  38. package/dist/chunk-SUGME4OT.js.map +1 -0
  39. package/dist/{chunk-YTMXBHFM.js → chunk-T375SUOZ.js} +2 -2
  40. package/dist/{chunk-PIEAE33T.js → chunk-Z4ZCBC7M.js} +2 -2
  41. package/dist/cli.js +4 -4
  42. package/dist/contract/index.d.ts +48 -16
  43. package/dist/contract/index.js +59 -19
  44. package/dist/contract/index.js.map +1 -1
  45. package/dist/{control-DjEgwWNo.d.ts → control-Bf8owbuG.d.ts} +2 -2
  46. package/dist/control.d.ts +5 -5
  47. package/dist/control.js +4 -4
  48. package/dist/{dataset-BlwAtYYf.d.ts → dataset-B2kL-fSM.d.ts} +1 -1
  49. package/dist/{errors-mje_cKOs.d.ts → errors-Dwqw-T_m.d.ts} +1 -1
  50. package/dist/{feedback-trajectory-DpUmE90J.d.ts → feedback-trajectory-8hKC5EOb.d.ts} +1 -1
  51. package/dist/governance/index.d.ts +3 -3
  52. package/dist/governance/index.js +1 -1
  53. package/dist/hosted/index.d.ts +5 -5
  54. package/dist/hosted/index.js +1 -1
  55. package/dist/{index-wlaiph9Y.d.ts → index-Bvk35ils.d.ts} +1 -1
  56. package/dist/{index-D2nT6_KT.d.ts → index-D9dwa00f.d.ts} +2 -2
  57. package/dist/index.d.ts +24 -132
  58. package/dist/index.js +23 -36
  59. package/dist/index.js.map +1 -1
  60. package/dist/{integrity-CfXjSqEv.d.ts → integrity-CJzrpUua.d.ts} +1 -1
  61. package/dist/knowledge/index.js +1 -1
  62. package/dist/{llm-client-BXVRUZyX.d.ts → llm-client-DbjLfz-K.d.ts} +1 -1
  63. package/dist/matrix/index.js +1 -1
  64. package/dist/meta-eval/index.d.ts +3 -3
  65. package/dist/meta-eval/index.js +1 -1
  66. package/dist/multishot/index.js +1 -1
  67. package/dist/openapi.json +1 -1
  68. package/dist/pipelines/index.js +4 -4
  69. package/dist/prm/index.js +1 -1
  70. package/dist/{run-improvement-loop-BhfdjrMY.d.ts → provenance-D0WeCXt1.d.ts} +208 -6
  71. package/dist/{red-team-CrC5MZYd.d.ts → red-team-DW9Ca_tj.d.ts} +1 -1
  72. package/dist/{registry-DK9kqXvb.d.ts → registry-qmbYT3Eo.d.ts} +2 -2
  73. package/dist/{release-report-DmPjIce3.d.ts → release-report-DszkgvJ3.d.ts} +3 -3
  74. package/dist/reporting.d.ts +6 -6
  75. package/dist/reporting.js +5 -5
  76. package/dist/{researcher-JP8EvnLv.d.ts → researcher-BaVsy0sW.d.ts} +4 -4
  77. package/dist/rl.d.ts +9 -9
  78. package/dist/rl.js +8 -8
  79. package/dist/{rubric-predictive-validity-B3qNa4aY.d.ts → rubric-predictive-validity-DgBHWsh7.d.ts} +1 -1
  80. package/dist/run-campaign-HXPJAUZ3.js +10 -0
  81. package/dist/{run-record-etiCMsUq.d.ts → run-record-DgUVo5pw.d.ts} +1 -1
  82. package/dist/{summary-report-DLxh4yWk.d.ts → summary-report-BQvXpvaR.d.ts} +1 -1
  83. package/dist/telemetry/file.js +1 -1
  84. package/dist/telemetry/index.js +1 -1
  85. package/dist/traces.d.ts +2 -2
  86. package/dist/traces.js +4 -4
  87. package/dist/{types-BgrxOJSf.d.ts → types-Beb6KPqZ.d.ts} +52 -4
  88. package/dist/wire/index.d.ts +3 -3
  89. package/dist/wire/index.js +4 -4
  90. package/package.json +1 -1
  91. package/dist/chunk-74Y2EMNH.js.map +0 -1
  92. package/dist/chunk-JB4UWIM6.js.map +0 -1
  93. package/dist/chunk-N4SBKEPJ.js.map +0 -1
  94. package/dist/chunk-NSBPE2FW.js +0 -17
  95. package/dist/chunk-QYJT52YW.js.map +0 -1
  96. package/dist/chunk-ZWEQJIM6.js +0 -220
  97. package/dist/chunk-ZWEQJIM6.js.map +0 -1
  98. package/dist/run-campaign-ZURVWMMI.js +0 -10
  99. /package/dist/{chunk-QDOSODID.js.map → chunk-3B7Y5AUR.js.map} +0 -0
  100. /package/dist/{chunk-J4DIMSRK.js.map → chunk-6EKXFFGQ.js.map} +0 -0
  101. /package/dist/{chunk-MHQPVHXU.js.map → chunk-6QDKWHLS.js.map} +0 -0
  102. /package/dist/{chunk-63EPZQUZ.js.map → chunk-6REHLN5J.js.map} +0 -0
  103. /package/dist/{chunk-GM476SZU.js.map → chunk-AIWHLG7J.js.map} +0 -0
  104. /package/dist/{chunk-AIXHUIHG.js.map → chunk-B26KI423.js.map} +0 -0
  105. /package/dist/{chunk-NCK5QLGT.js.map → chunk-F3SRAAZO.js.map} +0 -0
  106. /package/dist/{chunk-VXNVVBZO.js.map → chunk-IHDHUN2X.js.map} +0 -0
  107. /package/dist/{chunk-S3SDD56V.js.map → chunk-ITBRCT73.js.map} +0 -0
  108. /package/dist/{chunk-OLIBRKRD.js.map → chunk-KX6F6NCG.js.map} +0 -0
  109. /package/dist/{chunk-NSBPE2FW.js.map → chunk-PZ5AY32C.js.map} +0 -0
  110. /package/dist/{chunk-UBPIXOC4.js.map → chunk-SBCB6VZY.js.map} +0 -0
  111. /package/dist/{chunk-YTMXBHFM.js.map → chunk-T375SUOZ.js.map} +0 -0
  112. /package/dist/{chunk-PIEAE33T.js.map → chunk-Z4ZCBC7M.js.map} +0 -0
  113. /package/dist/{run-campaign-ZURVWMMI.js.map → run-campaign-HXPJAUZ3.js.map} +0 -0
@@ -0,0 +1,113 @@
1
+ import {
2
+ llmSpans
3
+ } from "./chunk-47X6LRCE.js";
4
+
5
+ // src/contamination-guard.ts
6
/**
 * Find every scenario whose canary string occurs verbatim in `output`.
 *
 * Scenarios with no (or an empty) `canary` are ignored.
 *
 * @param {string} output - Text to scan.
 * @param {Iterable<{id: string, canary?: string}>} scenarios - Scenarios to check.
 * @returns {Array<{scenarioId: string, canary: string, evidence: string}>}
 *   One entry per scenario whose canary was found, in input order.
 */
function checkCanaries(output, scenarios) {
  const found = [];
  for (const { id, canary } of scenarios) {
    const leaked = Boolean(canary) && output.includes(canary);
    if (leaked) {
      found.push({ scenarioId: id, canary, evidence: excerpt(output, canary) });
    }
  }
  return found;
}
16
/**
 * Check one agent output against a scenario's forbidden pattern.
 *
 * The pattern is `scenario.forbiddenPattern` when that is not null/undefined,
 * otherwise `scenario.canary`. Matching is delegated to `matchForbidden`.
 *
 * @param {string} output - Agent output to inspect.
 * @param {{id: string, forbiddenPattern?: string, canary?: string}} scenario
 * @returns {{scenarioId: string, canary: string, evidence: string} | null}
 *   A leak record when the pattern matched; `null` when the scenario has no
 *   usable pattern or nothing matched.
 */
function checkBehavioralCanary(output, scenario) {
  const { id, forbiddenPattern, canary } = scenario;
  const pattern = forbiddenPattern ?? canary;
  const hit = pattern ? matchForbidden(output, pattern) : null;
  if (hit) {
    return { scenarioId: id, canary: pattern, evidence: excerpt(output, hit) };
  }
  return null;
}
27
/**
 * Run `checkBehavioralCanary` over many (scenario, output) pairs.
 *
 * @param {Iterable<{scenario: object, output: string, runId?: string}>} cases
 * @returns {Array<object>} One leak record per case that matched, each carrying
 *   a `runId` key (the case's `runId`, falling back to the leak's own).
 */
function runBehavioralCanaries(cases) {
  const collected = [];
  for (const { scenario, output, runId } of cases) {
    const leak = checkBehavioralCanary(output, scenario);
    if (!leak) continue;
    collected.push({ ...leak, runId: runId ?? leak.runId });
  }
  return collected;
}
35
/**
 * Resolve a forbidden-pattern string to the text it matched inside `output`.
 *
 * When `tryParseRegex` yields a RegExp the pattern is applied as a regex and
 * the first match is returned (empty matches count as no match); otherwise the
 * pattern is treated as a literal substring.
 *
 * @param {string} output - Text to search.
 * @param {string} pattern - `/body/flags` notation or a literal substring.
 * @returns {string | null} The matched text, or `null` when nothing matched.
 */
function matchForbidden(output, pattern) {
  const regex = tryParseRegex(pattern);
  if (!regex) {
    return output.includes(pattern) ? pattern : null;
  }
  const found = output.match(regex);
  if (!found) return null;
  const [first] = found;
  return first.length > 0 ? first : null;
}
43
/**
 * Interpret a string written in `/body/flags` notation as a RegExp.
 *
 * @param {string} pattern - Candidate pattern text.
 * @returns {RegExp | null} The compiled regex, or `null` when the text is not
 *   slash-delimited, carries characters outside `gimsuy` after the last slash,
 *   or fails to compile.
 */
function tryParseRegex(pattern) {
  const slashDelimited = pattern.length >= 2 && pattern.startsWith("/");
  if (!slashDelimited) return null;

  const closing = pattern.lastIndexOf("/");
  if (closing < 1) return null;

  const source = pattern.slice(1, closing);
  const flagText = pattern.slice(closing + 1);
  const flagsLookValid = /^[gimsuy]*$/.test(flagText);
  if (!flagsLookValid) return null;

  let compiled = null;
  try {
    compiled = new RegExp(source, flagText);
  } catch {
    // Invalid body or duplicated flags: the caller falls back to literal matching.
    compiled = null;
  }
  return compiled;
}
56
/**
 * Scan the spans returned by `llmSpans(store)` for canary strings.
 *
 * Returns early with an empty list (without querying the store) when no
 * scenario carries a canary. A span whose `output` is null/undefined is
 * scanned as the empty string.
 *
 * @param {object} store - Passed through to `llmSpans`.
 * @param {Array<{id: string, canary?: string}>} scenarios
 * @returns {Promise<Array<{scenarioId: string, canary: string, runId: *, evidence: string}>>}
 *   One entry per (span, scenario) pair whose output contains the canary.
 */
async function canaryLeakView(store, scenarios) {
  const watched = scenarios.filter(({ canary }) => Boolean(canary));
  if (watched.length === 0) {
    return [];
  }

  const found = [];
  for (const span of await llmSpans(store)) {
    const text = span.output ?? "";
    for (const scenario of watched) {
      const { canary } = scenario;
      if (!canary || !text.includes(canary)) continue;
      found.push({
        scenarioId: scenario.id,
        canary,
        runId: span.runId,
        evidence: excerpt(text, canary),
      });
    }
  }
  return found;
}
76
/**
 * Wraps a list of holdout scenarios and records every read in an access log.
 *
 * Each successful `get` appends `{ scenarioId, purpose, at }` to the log, where
 * `at` is `Date.now()` at the time of the read.
 */
var HoldoutAuditor = class {
  scenarios;
  accessLog = [];
  /**
   * @param {Array<{id: string}>} scenarios - Holdout scenarios to guard.
   */
  constructor(scenarios) {
    this.scenarios = scenarios;
  }
  /**
   * Retrieve a holdout scenario for a declared purpose.
   *
   * @param {string} scenarioId - `id` of the scenario to fetch.
   * @param {"evaluation" | "debugging"} purpose - Why the scenario is being read.
   * @returns {object} The matching scenario.
   * @throws {Error} When `purpose` is neither 'evaluation' nor 'debugging',
   *   or when no scenario has the given id. Failed reads are not logged.
   */
  get(scenarioId, purpose) {
    if (purpose !== "evaluation" && purpose !== "debugging") {
      throw new Error(
        `HoldoutAuditor.get: purpose must be 'evaluation' or 'debugging', got ${purpose}`
      );
    }
    const s = this.scenarios.find((x) => x.id === scenarioId);
    if (!s) throw new Error(`holdout scenario "${scenarioId}" not found`);
    this.accessLog.push({ scenarioId, purpose, at: Date.now() });
    return s;
  }
  /**
   * Snapshot of the access log, oldest entry first.
   *
   * Returns copies of the entries so a caller cannot rewrite or truncate the
   * audit trail by mutating the returned array or its elements.
   *
   * @returns {Array<{scenarioId: string, purpose: string, at: number}>}
   */
  getAccessLog() {
    return this.accessLog.map((entry) => ({ ...entry }));
  }
};
98
/**
 * Build a short evidence snippet around the first occurrence of `needle`.
 *
 * The snippet spans up to 30 UTF-16 code units of context on each side of the
 * match and is prefixed/suffixed with an ellipsis when text was cut off.
 * If a window edge would land between the two halves of a surrogate pair, the
 * window is widened by one code unit so the snippet never contains a lone
 * surrogate.
 *
 * @param {string} source - Text that was searched.
 * @param {string} needle - Substring to locate.
 * @returns {string} The snippet, or `""` when `needle` does not occur.
 */
function excerpt(source, needle) {
  const at = source.indexOf(needle);
  if (at < 0) return "";
  const isHighSurrogate = (code) => code >= 0xd800 && code <= 0xdbff;
  const isLowSurrogate = (code) => code >= 0xdc00 && code <= 0xdfff;
  let start = Math.max(0, at - 30);
  let end = Math.min(source.length, at + needle.length + 30);
  // Leading edge sits on the low half of a pair: pull the high half in.
  if (start > 0 && isLowSurrogate(source.charCodeAt(start)) && isHighSurrogate(source.charCodeAt(start - 1))) {
    start -= 1;
  }
  // Trailing edge sits right after the high half of a pair: pull the low half in.
  if (end < source.length && isHighSurrogate(source.charCodeAt(end - 1)) && isLowSurrogate(source.charCodeAt(end))) {
    end += 1;
  }
  return (start > 0 ? "\u2026" : "") + source.slice(start, end) + (end < source.length ? "\u2026" : "");
}
105
+
106
+ export {
107
+ checkCanaries,
108
+ checkBehavioralCanary,
109
+ runBehavioralCanaries,
110
+ canaryLeakView,
111
+ HoldoutAuditor
112
+ };
113
+ //# sourceMappingURL=chunk-SHTXZ4O2.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/contamination-guard.ts"],"sourcesContent":["/**\n * ContaminationGuard — ensures held-out scenarios don't leak into\n * training/prompt paths, and flags model memorization.\n *\n * Three probes:\n * 1. `checkCanaries(output, scenario)` — if the scenario carries a\n * canary token, it MUST NOT appear in the agent's output.\n * Canaries are strings that are statistically impossible to\n * reconstruct from the scenario description alone — so if they\n * echo back, the model memorized them.\n * 2. `canaryLeakView(store)` — cross-corpus view of every run whose\n * output contained a canary, with the offending scenario + run.\n * 3. `HoldoutAuditor` — wraps a Dataset and emits a structured error\n * on any code path that reads holdout scenarios but doesn't flag\n * `purpose: 'evaluation'`. Keeps engineers honest.\n */\n\nimport type { DatasetScenario } from './dataset'\nimport { llmSpans } from './trace/query'\nimport type { TraceStore } from './trace/store'\n\nexport interface CanaryLeak {\n scenarioId: string\n canary: string\n runId?: string\n evidence: string\n}\n\nexport function checkCanaries(output: string, scenarios: DatasetScenario[]): CanaryLeak[] {\n const leaks: CanaryLeak[] = []\n for (const s of scenarios) {\n if (!s.canary) continue\n if (output.includes(s.canary)) {\n leaks.push({ scenarioId: s.id, canary: s.canary, evidence: excerpt(output, s.canary) })\n }\n }\n return leaks\n}\n\n/**\n * Behavioral canary: tests the AGENT, not the eval grep.\n *\n * - `forbiddenPattern` PRESENT in output ⇒ AGENT EMITTED BAD CONTENT ⇒ LEAK.\n * - `forbiddenPattern` ABSENT from output ⇒ AGENT HELD THE LINE ⇒ PASS.\n *\n * Use when running known-bad-prompt scenarios against the agent under\n * test and you want to know if the agent misbehaved. 
The classical\n * {@link checkCanaries} / {@link import('./canary').runCanaries | runCanaries}\n * test whether the eval check fires when the bad output is forced\n * into the eval flow — different question, different answer.\n *\n * Pattern resolution order (first match wins):\n * 1. `scenario.forbiddenPattern` — if it parses as `/body/flags`,\n * treated as a regex; otherwise a literal substring.\n * 2. `scenario.canary` — literal substring fallback so the helper\n * works on existing scenario fixtures.\n *\n * Returns `null` when nothing forbidden was found OR the scenario\n * declared no pattern.\n */\nexport function checkBehavioralCanary(\n output: string,\n scenario: DatasetScenario,\n): CanaryLeak | null {\n const pattern = scenario.forbiddenPattern ?? scenario.canary\n if (!pattern) return null\n const hit = matchForbidden(output, pattern)\n if (!hit) return null\n return {\n scenarioId: scenario.id,\n canary: pattern,\n evidence: excerpt(output, hit),\n }\n}\n\n/**\n * Behavioral canary over many (scenario, output) pairs. Sibling to\n * {@link import('./canary').runCanaries | runCanaries} — same idea\n * (run-many → report) but the question being answered is \"did the\n * AGENT misbehave?\" rather than \"did the EVAL grep fire?\".\n *\n * Returns one `CanaryLeak` per pair where the agent's output\n * contained its scenario's `forbiddenPattern` (or `canary` fallback).\n */\nexport function runBehavioralCanaries(\n cases: Array<{ scenario: DatasetScenario; output: string; runId?: string }>,\n): CanaryLeak[] {\n const leaks: CanaryLeak[] = []\n for (const c of cases) {\n const leak = checkBehavioralCanary(c.output, c.scenario)\n if (leak) leaks.push({ ...leak, runId: c.runId ?? leak.runId })\n }\n return leaks\n}\n\n/**\n * Resolve a forbidden-pattern string to the matched substring inside\n * `output`. 
`/body/flags` notation is interpreted as a regex; anything\n * else is a literal substring.\n */\nfunction matchForbidden(output: string, pattern: string): string | null {\n const re = tryParseRegex(pattern)\n if (re) {\n const m = output.match(re)\n return m && m[0].length > 0 ? m[0] : null\n }\n return output.includes(pattern) ? pattern : null\n}\n\nfunction tryParseRegex(pattern: string): RegExp | null {\n if (pattern.length < 2 || pattern[0] !== '/') return null\n const last = pattern.lastIndexOf('/')\n if (last <= 0) return null\n const body = pattern.slice(1, last)\n const flags = pattern.slice(last + 1)\n if (!/^[gimsuy]*$/.test(flags)) return null\n try {\n return new RegExp(body, flags)\n } catch {\n return null\n }\n}\n\n/**\n * Scan the LLM-output history in a corpus; returns every case where a\n * canary from a known scenario appeared in agent output. Pass the full\n * set of scenarios whose canaries you care about (typically the whole\n * held-out slice).\n */\nexport async function canaryLeakView(\n store: TraceStore,\n scenarios: DatasetScenario[],\n): Promise<CanaryLeak[]> {\n const targets = scenarios.filter((s) => !!s.canary)\n if (targets.length === 0) return []\n const spans = await llmSpans(store)\n const leaks: CanaryLeak[] = []\n for (const span of spans) {\n const output = span.output ?? ''\n for (const s of targets) {\n if (s.canary && output.includes(s.canary)) {\n leaks.push({\n scenarioId: s.id,\n canary: s.canary,\n runId: span.runId,\n evidence: excerpt(output, s.canary),\n })\n }\n }\n }\n return leaks\n}\n\nexport class HoldoutAuditor {\n private scenarios: DatasetScenario[]\n private accessLog: Array<{ scenarioId: string; purpose: string; at: number }> = []\n\n constructor(scenarios: DatasetScenario[]) {\n this.scenarios = scenarios\n }\n\n /** Retrieve a holdout scenario for a declared purpose. Non-'evaluation' throws. 
*/\n get(scenarioId: string, purpose: 'evaluation' | 'debugging'): DatasetScenario {\n if (purpose !== 'evaluation' && purpose !== 'debugging') {\n throw new Error(\n `HoldoutAuditor.get: purpose must be 'evaluation' or 'debugging', got ${purpose}`,\n )\n }\n const s = this.scenarios.find((x) => x.id === scenarioId)\n if (!s) throw new Error(`holdout scenario \"${scenarioId}\" not found`)\n this.accessLog.push({ scenarioId, purpose, at: Date.now() })\n return s\n }\n\n getAccessLog(): ReadonlyArray<{ scenarioId: string; purpose: string; at: number }> {\n return this.accessLog\n }\n}\n\nfunction excerpt(source: string, needle: string): string {\n const at = source.indexOf(needle)\n if (at < 0) return ''\n const start = Math.max(0, at - 30)\n const end = Math.min(source.length, at + needle.length + 30)\n return (start > 0 ? '…' : '') + source.slice(start, end) + (end < source.length ? '…' : '')\n}\n"],"mappings":";;;;;AA4BO,SAAS,cAAc,QAAgB,WAA4C;AACxF,QAAM,QAAsB,CAAC;AAC7B,aAAW,KAAK,WAAW;AACzB,QAAI,CAAC,EAAE,OAAQ;AACf,QAAI,OAAO,SAAS,EAAE,MAAM,GAAG;AAC7B,YAAM,KAAK,EAAE,YAAY,EAAE,IAAI,QAAQ,EAAE,QAAQ,UAAU,QAAQ,QAAQ,EAAE,MAAM,EAAE,CAAC;AAAA,IACxF;AAAA,EACF;AACA,SAAO;AACT;AAuBO,SAAS,sBACd,QACA,UACmB;AACnB,QAAM,UAAU,SAAS,oBAAoB,SAAS;AACtD,MAAI,CAAC,QAAS,QAAO;AACrB,QAAM,MAAM,eAAe,QAAQ,OAAO;AAC1C,MAAI,CAAC,IAAK,QAAO;AACjB,SAAO;AAAA,IACL,YAAY,SAAS;AAAA,IACrB,QAAQ;AAAA,IACR,UAAU,QAAQ,QAAQ,GAAG;AAAA,EAC/B;AACF;AAWO,SAAS,sBACd,OACc;AACd,QAAM,QAAsB,CAAC;AAC7B,aAAW,KAAK,OAAO;AACrB,UAAM,OAAO,sBAAsB,EAAE,QAAQ,EAAE,QAAQ;AACvD,QAAI,KAAM,OAAM,KAAK,EAAE,GAAG,MAAM,OAAO,EAAE,SAAS,KAAK,MAAM,CAAC;AAAA,EAChE;AACA,SAAO;AACT;AAOA,SAAS,eAAe,QAAgB,SAAgC;AACtE,QAAM,KAAK,cAAc,OAAO;AAChC,MAAI,IAAI;AACN,UAAM,IAAI,OAAO,MAAM,EAAE;AACzB,WAAO,KAAK,EAAE,CAAC,EAAE,SAAS,IAAI,EAAE,CAAC,IAAI;AAAA,EACvC;AACA,SAAO,OAAO,SAAS,OAAO,IAAI,UAAU;AAC9C;AAEA,SAAS,cAAc,SAAgC;AACrD,MAAI,QAAQ,SAAS,KAAK,QAAQ,CAAC,MAAM,IAAK,QAAO;AACrD,QAAM,OAAO,QAAQ,YAAY,GAAG;AACpC,MAAI,QAAQ,EAAG,QAAO;AACtB,QAAM,OAAO,QAAQ,MAAM,GAAG,IAAI;AACl
C,QAAM,QAAQ,QAAQ,MAAM,OAAO,CAAC;AACpC,MAAI,CAAC,cAAc,KAAK,KAAK,EAAG,QAAO;AACvC,MAAI;AACF,WAAO,IAAI,OAAO,MAAM,KAAK;AAAA,EAC/B,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAQA,eAAsB,eACpB,OACA,WACuB;AACvB,QAAM,UAAU,UAAU,OAAO,CAAC,MAAM,CAAC,CAAC,EAAE,MAAM;AAClD,MAAI,QAAQ,WAAW,EAAG,QAAO,CAAC;AAClC,QAAM,QAAQ,MAAM,SAAS,KAAK;AAClC,QAAM,QAAsB,CAAC;AAC7B,aAAW,QAAQ,OAAO;AACxB,UAAM,SAAS,KAAK,UAAU;AAC9B,eAAW,KAAK,SAAS;AACvB,UAAI,EAAE,UAAU,OAAO,SAAS,EAAE,MAAM,GAAG;AACzC,cAAM,KAAK;AAAA,UACT,YAAY,EAAE;AAAA,UACd,QAAQ,EAAE;AAAA,UACV,OAAO,KAAK;AAAA,UACZ,UAAU,QAAQ,QAAQ,EAAE,MAAM;AAAA,QACpC,CAAC;AAAA,MACH;AAAA,IACF;AAAA,EACF;AACA,SAAO;AACT;AAEO,IAAM,iBAAN,MAAqB;AAAA,EAClB;AAAA,EACA,YAAwE,CAAC;AAAA,EAEjF,YAAY,WAA8B;AACxC,SAAK,YAAY;AAAA,EACnB;AAAA;AAAA,EAGA,IAAI,YAAoB,SAAsD;AAC5E,QAAI,YAAY,gBAAgB,YAAY,aAAa;AACvD,YAAM,IAAI;AAAA,QACR,wEAAwE,OAAO;AAAA,MACjF;AAAA,IACF;AACA,UAAM,IAAI,KAAK,UAAU,KAAK,CAAC,MAAM,EAAE,OAAO,UAAU;AACxD,QAAI,CAAC,EAAG,OAAM,IAAI,MAAM,qBAAqB,UAAU,aAAa;AACpE,SAAK,UAAU,KAAK,EAAE,YAAY,SAAS,IAAI,KAAK,IAAI,EAAE,CAAC;AAC3D,WAAO;AAAA,EACT;AAAA,EAEA,eAAmF;AACjF,WAAO,KAAK;AAAA,EACd;AACF;AAEA,SAAS,QAAQ,QAAgB,QAAwB;AACvD,QAAM,KAAK,OAAO,QAAQ,MAAM;AAChC,MAAI,KAAK,EAAG,QAAO;AACnB,QAAM,QAAQ,KAAK,IAAI,GAAG,KAAK,EAAE;AACjC,QAAM,MAAM,KAAK,IAAI,OAAO,QAAQ,KAAK,OAAO,SAAS,EAAE;AAC3D,UAAQ,QAAQ,IAAI,WAAM,MAAM,OAAO,MAAM,OAAO,GAAG,KAAK,MAAM,OAAO,SAAS,WAAM;AAC1F;","names":[]}
@@ -1,18 +1,19 @@
1
1
  import {
2
2
  runCampaign
3
- } from "./chunk-74Y2EMNH.js";
3
+ } from "./chunk-OLULBECP.js";
4
4
  import {
5
5
  buildReflectionPrompt,
6
6
  parseReflectionResponse,
7
7
  runCanaries,
8
- scoreRedTeamOutput
9
- } from "./chunk-N4SBKEPJ.js";
8
+ scoreRedTeamOutput,
9
+ summarizeBackendIntegrity
10
+ } from "./chunk-GMXHLSLL.js";
10
11
  import {
11
12
  detectRewardHacking
12
13
  } from "./chunk-YV7J7X5N.js";
13
14
  import {
14
15
  callLlm
15
- } from "./chunk-VXNVVBZO.js";
16
+ } from "./chunk-IHDHUN2X.js";
16
17
 
17
18
  // src/campaign/auto-pr.ts
18
19
  import { execSync } from "child_process";
@@ -174,15 +175,17 @@ function gepaDriver(opts) {
174
175
  );
175
176
  const proposals = parseReflectionResponse(result.content, ctx.populationSize);
176
177
  const out = [];
178
+ const seen = /* @__PURE__ */ new Set();
177
179
  const constraints = opts.constraints;
178
180
  const preserveSections = constraints?.preserveSections !== void 0 ? constraints.preserveSections.length === 0 ? extractH2Sections(parent) : constraints.preserveSections : null;
179
181
  const maxEdits = constraints?.maxSentenceEdits;
180
182
  for (const proposal of proposals) {
181
183
  const text = typeof proposal.payload === "string" ? proposal.payload.trim() : "";
182
- if (!text || text === parent || out.includes(text)) continue;
184
+ if (!text || text === parent || seen.has(text)) continue;
183
185
  if (preserveSections && !validatePreservedSections(text, preserveSections)) continue;
184
186
  if (maxEdits !== void 0 && countSentenceEdits(parent, text) > maxEdits * 2) continue;
185
- out.push(text);
187
+ seen.add(text);
188
+ out.push({ surface: text, label: proposal.label, rationale: proposal.rationale });
186
189
  }
187
190
  return out;
188
191
  }
@@ -435,6 +438,19 @@ function meanForScenarios(judgeScoresByCell, scenarioIds) {
435
438
  return composites.length === 0 ? 0 : composites.reduce((a, b) => a + b, 0) / composites.length;
436
439
  }
437
440
 
441
+ // src/campaign/types.ts
442
/**
 * Tell whether `value` is a structured candidate rather than a bare surface.
 *
 * @param {*} value - Item to classify.
 * @returns {boolean} `true` only for non-null objects that expose `surface`,
 *   `label` and `rationale` (own or inherited).
 */
function isProposedCandidate(value) {
  if (value === null || typeof value !== "object") {
    return false;
  }
  return ["surface", "label", "rationale"].every((key) => key in value);
}
445
// Numeric rank per label-trust tier. Built without a prototype so that
// inherited names such as "toString" or "constructor" never resolve to a value.
var LABEL_TRUST_RANK = Object.freeze({
  __proto__: null,
  unverified: 0,
  "verified-signal": 1,
  "human-rated": 2
});
/**
 * Map a label-trust tier to its numeric rank.
 *
 * @param {string | null | undefined} trust - Tier name; null/undefined is
 *   treated as "unverified".
 * @returns {number} 0, 1 or 2. A tier name that is not in the table ranks as
 *   "unverified" (0), so comparisons on the result never see `undefined`.
 */
function labelTrustRank(trust) {
  return LABEL_TRUST_RANK[trust ?? "unverified"] ?? LABEL_TRUST_RANK.unverified;
}
453
+
438
454
  // src/campaign/presets/run-eval.ts
439
455
  async function runEval(opts) {
440
456
  return runCampaign(opts);
@@ -455,9 +471,11 @@ async function runOptimization(opts) {
455
471
  let winnerSurface = opts.baselineSurface;
456
472
  let winnerSurfaceHash = surfaceHash(opts.baselineSurface);
457
473
  let winnerComposite = meanComposite2(baselineCampaign);
474
+ let winnerLabel;
475
+ let winnerRationale;
458
476
  for (let gen = 0; gen < opts.maxGenerations; gen++) {
459
477
  if (opts.driver.decide?.({ history }).stop) break;
460
- const candidates = await opts.driver.propose({
478
+ const proposed = await opts.driver.propose({
461
479
  currentSurface: currentSurfaces[0] ?? opts.baselineSurface,
462
480
  history,
463
481
  findings: [],
@@ -468,9 +486,12 @@ async function runOptimization(opts) {
468
486
  dataset: opts.labeledStore && opts.labeledStore !== "off" ? opts.labeledStore : void 0,
469
487
  maxImprovementShots: opts.maxImprovementShots
470
488
  });
489
+ const candidates = proposed.map(
490
+ (p) => isProposedCandidate(p) ? p : { surface: p, label: "", rationale: "" }
491
+ );
471
492
  const surfaceResults = [];
472
493
  for (let i = 0; i < candidates.length; i++) {
473
- const surface = candidates[i];
494
+ const { surface, label, rationale } = candidates[i];
474
495
  const hash = surfaceHash(surface);
475
496
  const campaign = await runCampaign({
476
497
  ...opts,
@@ -478,7 +499,7 @@ async function runOptimization(opts) {
478
499
  runDir: `${opts.runDir}/gen-${gen}/candidate-${i}`
479
500
  });
480
501
  const composite = meanComposite2(campaign);
481
- surfaceResults.push({ surfaceHash: hash, surface, campaign, composite });
502
+ surfaceResults.push({ surfaceHash: hash, surface, label, rationale, campaign, composite });
482
503
  }
483
504
  surfaceResults.sort((a, b) => b.composite - a.composite);
484
505
  const promoted = surfaceResults.slice(0, promoteTopK);
@@ -488,18 +509,23 @@ async function runOptimization(opts) {
488
509
  winnerSurface = top.surface;
489
510
  winnerSurfaceHash = top.surfaceHash;
490
511
  winnerComposite = top.composite;
512
+ winnerLabel = top.label || void 0;
513
+ winnerRationale = top.rationale || void 0;
491
514
  }
492
515
  const record = {
493
516
  generationIndex: gen,
494
517
  candidates: surfaceResults.map((s) => {
495
518
  const breakdown = candidateBreakdown(s.campaign);
496
- return {
519
+ const candidate = {
497
520
  surfaceHash: s.surfaceHash,
498
521
  composite: s.composite,
499
522
  ci95: [s.composite, s.composite],
500
523
  dimensions: breakdown.dimensions,
501
524
  scenarios: breakdown.scenarios
502
525
  };
526
+ if (s.label) candidate.label = s.label;
527
+ if (s.rationale) candidate.rationale = s.rationale;
528
+ return candidate;
503
529
  }),
504
530
  promoted: promoted.map((p) => p.surfaceHash)
505
531
  };
@@ -517,6 +543,8 @@ async function runOptimization(opts) {
517
543
  generations,
518
544
  winnerSurface,
519
545
  winnerSurfaceHash,
546
+ winnerLabel,
547
+ winnerRationale,
520
548
  baselineCampaign
521
549
  };
522
550
  }
@@ -584,7 +612,7 @@ async function runImprovementLoop(opts) {
584
612
  throw new Error("runImprovementLoop: autoOnPromote='pr' requires ghOwner + ghRepo.");
585
613
  }
586
614
  const optimization = await runOptimization(opts);
587
- const { runCampaign: runCampaign2 } = await import("./run-campaign-ZURVWMMI.js");
615
+ const { runCampaign: runCampaign2 } = await import("./run-campaign-HXPJAUZ3.js");
588
616
  const baselineOnHoldout = await runCampaign2({
589
617
  ...opts,
590
618
  scenarios: opts.holdoutScenarios,
@@ -621,10 +649,10 @@ async function runImprovementLoop(opts) {
621
649
  },
622
650
  signal: new AbortController().signal
623
651
  });
652
+ const render = opts.renderPromotedDiff ?? defaultRenderDiff;
653
+ const promotedDiff = optimization.winnerSurfaceHash === surfaceHash(opts.baselineSurface) ? "" : render(optimization.winnerSurface, opts.baselineSurface);
624
654
  let prResult;
625
655
  if (opts.autoOnPromote === "pr" && gateResult.decision === "ship") {
626
- const render = opts.renderPromotedDiff ?? defaultRenderDiff;
627
- const promotedDiff = render(optimization.winnerSurface, opts.baselineSurface);
628
656
  prResult = openAutoPr({
629
657
  result: winnerOnHoldout,
630
658
  gate: gateResult,
@@ -638,6 +666,7 @@ async function runImprovementLoop(opts) {
638
666
  baselineOnHoldout,
639
667
  winnerOnHoldout,
640
668
  gateResult,
669
+ promotedDiff,
641
670
  prResult
642
671
  };
643
672
  }
@@ -658,6 +687,219 @@ ${fmt(winnerSurface)}`;
658
687
  return lines.join("\n");
659
688
  }
660
689
 
690
+ // src/campaign/provenance.ts
691
+ import { createHash as createHash2 } from "crypto";
692
+ import { join as join2 } from "path";
693
/**
 * Compute a `sha256:`-prefixed hex digest identifying a surface.
 *
 * String surfaces are hashed as-is. Any other surface is hashed via the JSON
 * encoding of its `kind`, `worktreeRef` and `baseRef` (null when absent), in
 * that key order.
 *
 * @param {string | {kind: *, worktreeRef: *, baseRef?: *}} surface
 * @returns {string} `sha256:<64 hex chars>`.
 */
function surfaceContentHash(surface) {
  let material;
  if (typeof surface === "string") {
    material = surface;
  } else {
    const { kind, worktreeRef, baseRef } = surface;
    material = JSON.stringify({ kind, worktreeRef, baseRef: baseRef ?? null });
  }
  const digest = createHash2("sha256").update(material).digest("hex");
  return `sha256:${digest}`;
}
701
/**
 * Average composite score of a campaign: mean over cells of each cell's mean
 * judge composite.
 *
 * Cells with a truthy `error` and cells with no judge scores are left out.
 *
 * @param {{cells: Iterable<{error?: *, judgeScores: Object<string, {composite: number}>}>}} campaign
 * @returns {number} The mean of per-cell means, or 0 when no cell contributes.
 */
function meanHoldoutComposite(campaign) {
  const average = (values) => values.reduce((sum, v) => sum + v, 0) / values.length;

  const perCell = [];
  for (const cell of campaign.cells) {
    if (cell.error) continue;
    const composites = Object.values(cell.judgeScores).map(({ composite }) => composite);
    if (composites.length > 0) {
      perCell.push(average(composites));
    }
  }
  return perCell.length > 0 ? average(perCell) : 0;
}
710
/**
 * Assemble the `tangle.loop-provenance.v1` record for one improvement-loop run.
 *
 * Reads worker records, per-generation candidates, the gate result and the two
 * holdout campaigns from `args`; the result is a plain object with a fixed key
 * order. `winnerLabel` / `winnerRationale` and each candidate's `label` /
 * `rationale` are included only when truthy.
 *
 * @param {object} args - Run inputs (`workerRecords`, `generations`, `gate`,
 *   `baselineOnHoldout`, `winnerOnHoldout`, surfaces, ids, totals, `diff`).
 * @returns {object} The provenance record.
 */
function buildLoopProvenanceRecord(args) {
  const { workerRecords, generations, gate } = args;
  const integrity = summarizeBackendIntegrity(workerRecords);
  const models = Array.from(new Set(workerRecords.map(({ model }) => model))).sort();

  const candidates = [];
  for (const generation of generations) {
    const promotedHashes = new Set(generation.promoted);
    const surfaceLookup = new Map(
      generation.surfaces.map(({ surfaceHash, surface }) => [surfaceHash, surface])
    );
    for (const { surfaceHash, composite, label, rationale } of generation.candidates) {
      const surface = surfaceLookup.get(surfaceHash);
      // Fall back to the short surface hash when the surface itself is unavailable.
      const contentHash = surface === undefined ? `sha256:${surfaceHash}` : surfaceContentHash(surface);
      candidates.push({
        generation: generation.generationIndex,
        surfaceHash,
        contentHash,
        composite,
        promoted: promotedHashes.has(surfaceHash),
        ...(label ? { label } : {}),
        ...(rationale ? { rationale } : {}),
      });
    }
  }

  const baselineHoldoutComposite = meanHoldoutComposite(args.baselineOnHoldout);
  const winnerHoldoutComposite = meanHoldoutComposite(args.winnerOnHoldout);

  return {
    schema: "tangle.loop-provenance.v1",
    runId: args.runId,
    runDir: args.runDir,
    timestamp: args.timestamp,
    baselineContentHash: surfaceContentHash(args.baselineSurface),
    winnerContentHash: surfaceContentHash(args.winnerSurface),
    diff: args.diff,
    candidates,
    gate: {
      decision: gate.decision,
      reasons: gate.reasons,
      delta: gate.delta,
      contributingGates: gate.contributingGates.map(({ name, passed }) => ({ name, passed })),
    },
    baselineHoldoutComposite,
    winnerHoldoutComposite,
    heldOutLift: winnerHoldoutComposite - baselineHoldoutComposite,
    backend: {
      verdict: integrity.verdict,
      workerCallCount: integrity.totalRecords,
      models,
      totalInputTokens: integrity.totalInputTokens,
      totalOutputTokens: integrity.totalOutputTokens,
      totalCostUsd: integrity.totalCostUsd,
    },
    totalCostUsd: args.totalCostUsd,
    totalDurationMs: args.totalDurationMs,
    ...(args.winnerLabel ? { winnerLabel: args.winnerLabel } : {}),
    ...(args.winnerRationale ? { winnerRationale: args.winnerRationale } : {}),
  };
}
769
// Gate decisions that map to an OK span status.
var DECISION_OK = ["ship"];
/**
 * SHA-256 hex digest of `parts` joined with ":".
 *
 * @param {string[]} parts - Components of the identifier.
 * @returns {string} 64 lowercase hex characters.
 */
function hashId(parts) {
  const hasher = createHash2("sha256");
  hasher.update(parts.join(":"));
  return hasher.digest("hex");
}
/**
 * Translate a gate decision into a span status object.
 *
 * @param {string} decision - Gate decision.
 * @returns {{code: "OK"} | {code: "ERROR", message: string}}
 */
function gateStatus(decision) {
  if (DECISION_OK.includes(decision)) {
    return { code: "OK" };
  }
  return { code: "ERROR", message: `gate decision: ${decision}` };
}
776
/**
 * Project a loop-provenance record into a flat list of trace spans:
 * one root span, one span per generation (ascending), one span per candidate
 * under its generation, and a final zero-length gate-decision span.
 *
 * All ids are derived from `record.runId` via `hashId`, so the same record
 * always yields the same ids. Every non-gate span shares the same start/end
 * window: start is `opts.baseTimeMs`, else the parsed `record.timestamp`, else
 * `Date.now()`; end is start plus at least 1 ms of `record.totalDurationMs`.
 *
 * NOTE: times are plain numbers in nanoseconds; epoch-millisecond values
 * multiplied by 1e6 are beyond Number.MAX_SAFE_INTEGER for wall-clock dates,
 * so they are not exact to the nanosecond.
 *
 * @param {object} record - Output of `buildLoopProvenanceRecord`.
 * @param {{baseTimeMs?: number}} [opts]
 * @returns {object[]} Spans in emission order.
 */
function loopProvenanceSpans(record, opts = {}) {
  const { runId } = record;
  const shortId = (parts) => hashId(parts).slice(0, 16);

  const traceId = hashId(["trace", runId]).slice(0, 32);
  const startMs = opts.baseTimeMs ?? (Date.parse(record.timestamp) || Date.now());
  const baseNano = startMs * 1e6;
  const endNano = baseNano + Math.max(1, record.totalDurationMs) * 1e6;
  const fullWindow = { startTimeUnixNano: baseNano, endTimeUnixNano: endNano };

  const rootSpanId = shortId(["root", runId]);
  const spans = [
    {
      traceId,
      spanId: rootSpanId,
      name: "improvement-loop",
      ...fullWindow,
      attributes: {
        "tangle.runId": runId,
        "tangle.runDir": record.runDir,
        "tangle.baselineContentHash": record.baselineContentHash,
        "tangle.winnerContentHash": record.winnerContentHash,
        "tangle.heldOutLift": record.heldOutLift,
        "tangle.gateDecision": record.gate.decision,
        "tangle.backendVerdict": record.backend.verdict,
        "tangle.workerCallCount": record.backend.workerCallCount,
        "tangle.totalCostUsd": record.totalCostUsd,
      },
      status: gateStatus(record.gate.decision),
      "tangle.runId": runId,
    },
  ];

  // Bucket candidates by generation, keeping their original relative order.
  const byGeneration = new Map();
  for (const candidate of record.candidates) {
    if (!byGeneration.has(candidate.generation)) {
      byGeneration.set(candidate.generation, []);
    }
    byGeneration.get(candidate.generation).push(candidate);
  }
  const orderedGenerations = Array.from(byGeneration).sort(([g1], [g2]) => g1 - g2);

  for (const [generation, members] of orderedGenerations) {
    const genSpanId = shortId(["gen", runId, String(generation)]);
    let bestComposite = 0;
    for (const { composite } of members) {
      bestComposite = Math.max(bestComposite, composite);
    }
    spans.push({
      traceId,
      spanId: genSpanId,
      parentSpanId: rootSpanId,
      name: `generation-${generation}`,
      ...fullWindow,
      attributes: {
        "tangle.runId": runId,
        "tangle.generation": generation,
        "tangle.populationSize": members.length,
        "tangle.bestComposite": bestComposite,
      },
      "tangle.runId": runId,
      "tangle.generation": generation,
    });

    for (const member of members) {
      const attributes = {
        "tangle.runId": runId,
        "tangle.generation": generation,
        "tangle.surfaceHash": member.surfaceHash,
        "tangle.contentHash": member.contentHash,
        "tangle.composite": member.composite,
        "tangle.promoted": member.promoted,
        ...(member.label ? { "tangle.candidateLabel": member.label } : {}),
        ...(member.rationale ? { "tangle.candidateRationale": member.rationale } : {}),
      };
      spans.push({
        traceId,
        spanId: shortId(["cand", runId, String(generation), member.surfaceHash]),
        parentSpanId: genSpanId,
        name: `candidate-${member.surfaceHash}`,
        ...fullWindow,
        attributes,
        "tangle.runId": runId,
        "tangle.generation": generation,
      });
    }
  }

  // The gate span is instantaneous and pinned to the end of the run window.
  spans.push({
    traceId,
    spanId: shortId(["gate", runId]),
    parentSpanId: rootSpanId,
    name: "gate-decision",
    startTimeUnixNano: endNano,
    endTimeUnixNano: endNano,
    attributes: {
      "tangle.runId": runId,
      "tangle.gateDecision": record.gate.decision,
      "tangle.gateDelta": record.gate.delta ?? record.heldOutLift,
      "tangle.gateReasons": JSON.stringify(record.gate.reasons),
      "tangle.heldOutLift": record.heldOutLift,
      "tangle.baselineHoldoutComposite": record.baselineHoldoutComposite,
      "tangle.winnerHoldoutComposite": record.winnerHoldoutComposite,
    },
    status: gateStatus(record.gate.decision),
    "tangle.runId": runId,
  });
  return spans;
}
878
/**
 * Location of the provenance record file inside a run directory.
 *
 * @param {string} runDir - Run directory.
 * @returns {string} `runDir` joined with `loop-provenance.json`.
 */
function provenanceRecordPath(runDir) {
  const fileName = "loop-provenance.json";
  return join2(runDir, fileName);
}
881
/**
 * Location of the provenance spans file inside a run directory.
 *
 * @param {string} runDir - Run directory.
 * @returns {string} `runDir` joined with `loop-provenance-spans.jsonl`.
 */
function provenanceSpansPath(runDir) {
  const fileName = "loop-provenance-spans.jsonl";
  return join2(runDir, fileName);
}
884
/**
 * Build the provenance record and spans for a run, write both through
 * `args.storage`, and optionally forward the spans to `args.hostedClient`.
 *
 * The record is written as pretty-printed JSON and the spans as one JSON
 * document per line. A failure of `hostedClient.ingestTraces` is logged with
 * `console.warn` and does not fail the call.
 *
 * @param {object} args - Inputs for `buildLoopProvenanceRecord`, plus
 *   `storage` (`ensureDir`, `write`), `runDir` and an optional `hostedClient`.
 * @returns {Promise<{record: object, spans: object[], recordPath: string, spansPath: string}>}
 */
async function emitLoopProvenance(args) {
  const record = buildLoopProvenanceRecord(args);
  const spans = loopProvenanceSpans(record);

  const { storage, runDir } = args;
  storage.ensureDir(runDir);

  const recordPath = provenanceRecordPath(runDir);
  const spansPath = provenanceSpansPath(runDir);
  const recordJson = JSON.stringify(record, null, 2);
  const spansJsonl = spans.map((span) => JSON.stringify(span)).join("\n");
  storage.write(recordPath, recordJson);
  storage.write(spansPath, spansJsonl);

  const { hostedClient } = args;
  if (hostedClient) {
    try {
      await hostedClient.ingestTraces(spans);
    } catch (err) {
      // Best effort: the local files are already written, so only warn.
      const msg = err instanceof Error ? err.message : String(err);
      console.warn(`[agent-eval] provenance span ingest failed (continuing): ${msg}`);
    }
  }
  return { record, spans, recordPath, spansPath };
}
902
+
661
903
  export {
662
904
  openAutoPr,
663
905
  evolutionaryDriver,
@@ -667,9 +909,18 @@ export {
667
909
  composeGate,
668
910
  defaultProductionGate,
669
911
  heldOutGate,
912
+ isProposedCandidate,
913
+ labelTrustRank,
670
914
  runEval,
671
915
  runOptimization,
672
916
  surfaceHash,
673
- runImprovementLoop
917
+ runImprovementLoop,
918
+ defaultRenderDiff,
919
+ surfaceContentHash,
920
+ buildLoopProvenanceRecord,
921
+ loopProvenanceSpans,
922
+ provenanceRecordPath,
923
+ provenanceSpansPath,
924
+ emitLoopProvenance
674
925
  };
675
- //# sourceMappingURL=chunk-JB4UWIM6.js.map
926
+ //# sourceMappingURL=chunk-SUGME4OT.js.map