@tangle-network/agent-eval 0.49.0 → 0.50.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/dist/adapters/http.d.ts +1 -1
  2. package/dist/adapters/langchain.d.ts +1 -1
  3. package/dist/adapters/otel.d.ts +8 -2
  4. package/dist/campaign/index.d.ts +3 -3
  5. package/dist/{chunk-PD3MH6WU.js → chunk-5KSDYBYH.js} +2 -2
  6. package/dist/{chunk-MNL6LXGQ.js → chunk-EGIPWXHL.js} +2 -98
  7. package/dist/chunk-EGIPWXHL.js.map +1 -0
  8. package/dist/{chunk-OYI6RZJK.js → chunk-FQK2CCIM.js} +1 -1
  9. package/dist/chunk-FQK2CCIM.js.map +1 -0
  10. package/dist/chunk-MAZ26DC7.js +99 -0
  11. package/dist/chunk-MAZ26DC7.js.map +1 -0
  12. package/dist/chunk-SHTXZ4O2.js +113 -0
  13. package/dist/chunk-SHTXZ4O2.js.map +1 -0
  14. package/dist/{chunk-KQ26DYTQ.js → chunk-UBQGWD3O.js} +2 -2
  15. package/dist/contract/index.d.ts +206 -9
  16. package/dist/contract/index.js +751 -3
  17. package/dist/contract/index.js.map +1 -1
  18. package/dist/governance/index.d.ts +1 -1
  19. package/dist/hosted/index.d.ts +8 -192
  20. package/dist/hosted/index.js +1 -1
  21. package/dist/index-BRxz6qov.d.ts +409 -0
  22. package/dist/index.d.ts +18 -462
  23. package/dist/index.js +14 -106
  24. package/dist/index.js.map +1 -1
  25. package/dist/meta-eval/index.d.ts +3 -3
  26. package/dist/openapi.json +1 -1
  27. package/dist/{outcome-store-BxJ3DQKJ.d.ts → outcome-store-D6KWmYvj.d.ts} +1 -1
  28. package/dist/registry-8KAs18kY.d.ts +457 -0
  29. package/dist/{release-report-DBB8lB1P.d.ts → release-report-DSu0DWy8.d.ts} +3 -296
  30. package/dist/reporting.d.ts +6 -4
  31. package/dist/reporting.js +6 -4
  32. package/dist/{researcher-CHMO56K0.d.ts → researcher-LZD0qHEa.d.ts} +1 -1
  33. package/dist/rl.d.ts +9 -8
  34. package/dist/rl.js +3 -2
  35. package/dist/rl.js.map +1 -1
  36. package/dist/{rubric-predictive-validity-CJ08tGwq.d.ts → rubric-predictive-validity-ByZEC3BX.d.ts} +1 -1
  37. package/dist/{run-improvement-loop-B-L8GgpW.d.ts → run-improvement-loop-BPMjNKMJ.d.ts} +2 -2
  38. package/dist/sequential-5iSVfzl2.d.ts +139 -0
  39. package/dist/store-CJbzDxZ2.d.ts +220 -0
  40. package/dist/{sequential-CbFH___X.d.ts → summary-report-B7gNRX-r.d.ts} +1 -139
  41. package/dist/traces.d.ts +3 -220
  42. package/dist/{types-8u72Gc76.d.ts → types-Dbj5gu8n.d.ts} +1 -1
  43. package/dist/types-DhqpAi_z.d.ts +296 -0
  44. package/package.json +1 -1
  45. package/dist/chunk-MNL6LXGQ.js.map +0 -1
  46. package/dist/chunk-OYI6RZJK.js.map +0 -1
  47. package/dist/{chunk-PD3MH6WU.js.map → chunk-5KSDYBYH.js.map} +0 -0
  48. package/dist/{chunk-KQ26DYTQ.js.map → chunk-UBQGWD3O.js.map} +0 -0
package/dist/index.js CHANGED
@@ -1,3 +1,10 @@
1
+ import {
2
+ HoldoutAuditor,
3
+ canaryLeakView,
4
+ checkBehavioralCanary,
5
+ checkCanaries,
6
+ runBehavioralCanaries
7
+ } from "./chunk-SHTXZ4O2.js";
1
8
  import {
2
9
  DEFAULT_MUTATION_PRIMITIVES,
3
10
  DEFAULT_RED_TEAM_CORPUS,
@@ -82,10 +89,10 @@ import {
82
89
  evaluateReleaseConfidence,
83
90
  judgeReplayGate,
84
91
  renderReleaseReport
85
- } from "./chunk-KQ26DYTQ.js";
92
+ } from "./chunk-UBQGWD3O.js";
86
93
  import {
87
94
  runEvalCampaign
88
- } from "./chunk-PD3MH6WU.js";
95
+ } from "./chunk-5KSDYBYH.js";
89
96
  import {
90
97
  AGENT_PROFILE_KINDS,
91
98
  AgentProfileCellValidationError,
@@ -106,14 +113,16 @@ import {
106
113
  verifyAgentProfileCell
107
114
  } from "./chunk-BWZEGTES.js";
108
115
  import {
109
- RESEARCH_REPORT_HARD_PAIR_FLOOR,
110
116
  evaluateInterimReleaseConfidence,
117
+ pairedEvalueSequence
118
+ } from "./chunk-MAZ26DC7.js";
119
+ import {
120
+ RESEARCH_REPORT_HARD_PAIR_FLOOR,
111
121
  gainHistogram,
112
- pairedEvalueSequence,
113
122
  paretoChart,
114
123
  researchReport,
115
124
  summaryTable
116
- } from "./chunk-MNL6LXGQ.js";
125
+ } from "./chunk-EGIPWXHL.js";
117
126
  import {
118
127
  benjaminiHochberg,
119
128
  bonferroni,
@@ -6938,107 +6947,6 @@ function defaultExtract(metric) {
6938
6947
  };
6939
6948
  }
6940
6949
 
6941
- // src/contamination-guard.ts
6942
- function checkCanaries(output, scenarios) {
6943
- const leaks = [];
6944
- for (const s of scenarios) {
6945
- if (!s.canary) continue;
6946
- if (output.includes(s.canary)) {
6947
- leaks.push({ scenarioId: s.id, canary: s.canary, evidence: excerpt2(output, s.canary) });
6948
- }
6949
- }
6950
- return leaks;
6951
- }
6952
- function checkBehavioralCanary(output, scenario) {
6953
- const pattern = scenario.forbiddenPattern ?? scenario.canary;
6954
- if (!pattern) return null;
6955
- const hit = matchForbidden(output, pattern);
6956
- if (!hit) return null;
6957
- return {
6958
- scenarioId: scenario.id,
6959
- canary: pattern,
6960
- evidence: excerpt2(output, hit)
6961
- };
6962
- }
6963
- function runBehavioralCanaries(cases) {
6964
- const leaks = [];
6965
- for (const c of cases) {
6966
- const leak = checkBehavioralCanary(c.output, c.scenario);
6967
- if (leak) leaks.push({ ...leak, runId: c.runId ?? leak.runId });
6968
- }
6969
- return leaks;
6970
- }
6971
- function matchForbidden(output, pattern) {
6972
- const re = tryParseRegex(pattern);
6973
- if (re) {
6974
- const m = output.match(re);
6975
- return m && m[0].length > 0 ? m[0] : null;
6976
- }
6977
- return output.includes(pattern) ? pattern : null;
6978
- }
6979
- function tryParseRegex(pattern) {
6980
- if (pattern.length < 2 || pattern[0] !== "/") return null;
6981
- const last = pattern.lastIndexOf("/");
6982
- if (last <= 0) return null;
6983
- const body = pattern.slice(1, last);
6984
- const flags = pattern.slice(last + 1);
6985
- if (!/^[gimsuy]*$/.test(flags)) return null;
6986
- try {
6987
- return new RegExp(body, flags);
6988
- } catch {
6989
- return null;
6990
- }
6991
- }
6992
- async function canaryLeakView(store, scenarios) {
6993
- const targets = scenarios.filter((s) => !!s.canary);
6994
- if (targets.length === 0) return [];
6995
- const spans = await llmSpans(store);
6996
- const leaks = [];
6997
- for (const span of spans) {
6998
- const output = span.output ?? "";
6999
- for (const s of targets) {
7000
- if (s.canary && output.includes(s.canary)) {
7001
- leaks.push({
7002
- scenarioId: s.id,
7003
- canary: s.canary,
7004
- runId: span.runId,
7005
- evidence: excerpt2(output, s.canary)
7006
- });
7007
- }
7008
- }
7009
- }
7010
- return leaks;
7011
- }
7012
- var HoldoutAuditor = class {
7013
- scenarios;
7014
- accessLog = [];
7015
- constructor(scenarios) {
7016
- this.scenarios = scenarios;
7017
- }
7018
- /** Retrieve a holdout scenario for a declared purpose. Non-'evaluation' throws. */
7019
- get(scenarioId, purpose) {
7020
- if (purpose !== "evaluation" && purpose !== "debugging") {
7021
- throw new Error(
7022
- `HoldoutAuditor.get: purpose must be 'evaluation' or 'debugging', got ${purpose}`
7023
- );
7024
- }
7025
- const s = this.scenarios.find((x) => x.id === scenarioId);
7026
- if (!s) throw new Error(`holdout scenario "${scenarioId}" not found`);
7027
- this.accessLog.push({ scenarioId, purpose, at: Date.now() });
7028
- return s;
7029
- }
7030
- getAccessLog() {
7031
- return this.accessLog;
7032
- }
7033
- };
7034
- function excerpt2(source, needle) {
7035
- const at = source.indexOf(needle);
7036
- if (at < 0) return "";
7037
- const start = Math.max(0, at - 30);
7038
- const end = Math.min(source.length, at + needle.length + 30);
7039
- return (start > 0 ? "\u2026" : "") + source.slice(start, end) + (end < source.length ? "\u2026" : "");
7040
- }
7041
-
7042
6950
  // src/observability.ts
7043
6951
  async function toLangfuseEnvelope(store, runId) {
7044
6952
  const run = await store.getRun(runId);