npm - @tangle-network/agent-eval - Versions diffs - 0.20.12 → 0.22.0 - Mend

@tangle-network/agent-eval 0.20.12 → 0.22.0

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

package/CHANGELOG.md +177 -0
package/README.md +43 -1
package/dist/{chunk-KWUAAIHR.js → chunk-4W4NCYM2.js} +182 -1
package/dist/chunk-4W4NCYM2.js.map +1 -0
package/dist/{chunk-PKCVBYTQ.js → chunk-5IIQKMD5.js} +38 -2
package/dist/chunk-5IIQKMD5.js.map +1 -0
package/dist/{chunk-HNJLMAJ2.js → chunk-6KQG5HAH.js} +2 -2
package/dist/chunk-6M774GY6.js +53 -0
package/dist/chunk-6M774GY6.js.map +1 -0
package/dist/{chunk-MCMV7DUL.js → chunk-ARZ6BEV6.js} +2 -2
package/dist/chunk-IOXMGMHQ.js +1226 -0
package/dist/chunk-IOXMGMHQ.js.map +1 -0
package/dist/{chunk-75MCTH7P.js → chunk-KAO3Q65R.js} +198 -3
package/dist/chunk-KAO3Q65R.js.map +1 -0
package/dist/chunk-QUKKGHTZ.js +121 -0
package/dist/chunk-QUKKGHTZ.js.map +1 -0
package/dist/chunk-SQQLHODJ.js +163 -0
package/dist/chunk-SQQLHODJ.js.map +1 -0
package/dist/{chunk-IKFVX537.js → chunk-UAND2LOT.js} +232 -211
package/dist/chunk-UAND2LOT.js.map +1 -0
package/dist/{chunk-HKYRWNHV.js → chunk-USHQBPMH.js} +283 -7
package/dist/chunk-USHQBPMH.js.map +1 -0
package/dist/cli.js +3 -2
package/dist/cli.js.map +1 -1
package/dist/{control-C8NKbF3w.d.ts → control-cxwMOAsy.d.ts} +3 -2
package/dist/control.d.ts +4 -3
package/dist/control.js +2 -2
package/dist/emitter-B2XqDKFU.d.ts +121 -0
package/dist/{feedback-trajectory-BGQ_ANCN.d.ts → feedback-trajectory-CB0A32o3.d.ts} +2 -1
package/dist/index.d.ts +16 -302
package/dist/index.js +70 -62
package/dist/index.js.map +1 -1
package/dist/integrity-K2oVlF57.d.ts +210 -0
package/dist/openapi.json +1 -1
package/dist/optimization-UVDNKaO6.d.ts +574 -0
package/dist/optimization.d.ts +7 -144
package/dist/optimization.js +9 -2
package/dist/reporting-B82RSv9C.d.ts +593 -0
package/dist/reporting.d.ts +5 -426
package/dist/reporting.js +17 -6
package/dist/{emitter-BYO2nSDA.d.ts → store-u47QaJ9G.d.ts} +1 -91
package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
package/dist/traces.d.ts +179 -3
package/dist/traces.js +35 -4
package/dist/wire/index.js +3 -2
package/docs/research-report-methodology.md +170 -0
package/docs/wire-protocol.md +1 -1
package/package.json +11 -13
package/dist/chunk-75MCTH7P.js.map +0 -1
package/dist/chunk-HKYRWNHV.js.map +0 -1
package/dist/chunk-IKFVX537.js.map +0 -1
package/dist/chunk-KWUAAIHR.js.map +0 -1
package/dist/chunk-ODFINDLQ.js +0 -413
package/dist/chunk-ODFINDLQ.js.map +0 -1
package/dist/chunk-PKCVBYTQ.js.map +0 -1
package/dist/{chunk-HNJLMAJ2.js.map → chunk-6KQG5HAH.js.map} +0 -0
package/dist/{chunk-MCMV7DUL.js.map → chunk-ARZ6BEV6.js.map} +0 -0

package/dist/index.js CHANGED Viewed

@@ -19,7 +19,7 @@ import {
   stopOnNoProgress,
   stopOnRepeatedAction,
   subjectiveEval
-} from "./chunk-MCMV7DUL.js";
+} from "./chunk-ARZ6BEV6.js";
 import {
   CallbackResearcher,
   DEFAULT_MUTATION_PRIMITIVES,
@@ -46,6 +46,7 @@ import {
   renderPreferenceMemoryMarkdown,
   replayFeedbackTrajectories,
   replayFeedbackTrajectory,
+  runEvalCampaign,
   runMultiShotOptimization,
   runPromptEvolution,
   scalarScore,
@@ -53,7 +54,7 @@ import {
   summarizePreferenceMemory,
   trialTraceFromMultiShotTrial,
   withAssignedFeedbackSplit
-} from "./chunk-HKYRWNHV.js";
+} from "./chunk-USHQBPMH.js";
 import {
   RunRecordValidationError,
   isRunRecord,
@@ -64,31 +65,36 @@ import {
 import {
   assertReleaseConfidence,
   bootstrapCi,
+  evaluateInterimReleaseConfidence,
   evaluateReleaseConfidence,
-  gainHistogram,
   judgeReplayGate,
-  paretoChart,
+  pairedEvalueSequence,
   releaseTraceEvidenceFromMultiShotTrials,
   renderReleaseReport,
-  summaryTable
-} from "./chunk-IKFVX537.js";
+  rubricPredictiveValidity
+} from "./chunk-UAND2LOT.js";
 import {
+  RESEARCH_REPORT_HARD_PAIR_FLOOR,
   benjaminiHochberg,
   bhAdjust,
   bonferroni,
   cohensD,
   confidenceInterval,
+  gainHistogram,
   interRaterReliability,
   mannWhitneyU,
   normalizeScores,
   pairedBootstrap,
   pairedTTest,
   pairedWilcoxon,
+  paretoChart,
   partialCredit,
   requiredSampleSize,
+  researchReport,
+  summaryTable,
   weightedMean,
   wilcoxonSignedRank
-} from "./chunk-ODFINDLQ.js";
+} from "./chunk-IOXMGMHQ.js";
 import {
   DEFAULT_REDACTION_RULES,
   DEFAULT_TRACE_ANALYST_BUDGETS,
@@ -98,6 +104,8 @@ import {
   OTEL_AGENT_EVAL_SCOPE,
   OtlpFileTraceStore,
   REDACTION_VERSION,
+  ReplayCache,
+  ReplayCacheMissError,
   SpanNotFoundError,
   TRACE_ANALYST_ACTOR_DESCRIPTION,
   TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
@@ -112,6 +120,7 @@ import {
   buildTraceAnalystTools,
   buildTraceInsightContext,
   buildTraceInsightPrompt,
+  createReplayFetch,
   defaultTraceInsightPanel,
   describeTraceInsightScope,
   domainEvidencePattern,
@@ -123,6 +132,7 @@ import {
   isRetrievalSpan,
   isSandboxSpan,
   isToolSpan,
+  iterateRawCalls,
   judgeSpans,
   llmSpans,
   planTraceInsightQuestions,
@@ -133,20 +143,42 @@ import {
   scoreTraceInsightReadiness,
   tokenizeDomainWords,
   toolSpans,
-  traceAnalystFunctionGroup
-} from "./chunk-KWUAAIHR.js";
+  traceAnalystFunctionGroup,
+  traceAnalystOnRunComplete
+} from "./chunk-4W4NCYM2.js";
+import {
+  RunIntegrityError,
+  assertRunCaptured,
+  throwIfRunIncomplete
+} from "./chunk-QUKKGHTZ.js";
 import {
   TraceEmitter,
   llmSpanFromProvider
-} from "./chunk-PKCVBYTQ.js";
+} from "./chunk-5IIQKMD5.js";
+import {
+  canonicalize,
+  evaluateHypothesis,
+  hashJson,
+  signManifest,
+  verifyManifest
+} from "./chunk-6M774GY6.js";
 import {
   LlmCallError,
   LlmClient,
+  LlmRouteAssertionError,
+  assertLlmRoute,
   callLlm,
   callLlmJson,
   probeLlm,
   stripFencedJson
-} from "./chunk-75MCTH7P.js";
+} from "./chunk-KAO3Q65R.js";
+import {
+  FileSystemRawProviderSink,
+  InMemoryRawProviderSink,
+  NoopRawProviderSink,
+  defaultProviderRedactor,
+  providerFromBaseUrl
+} from "./chunk-SQQLHODJ.js";
 import "./chunk-PZ5AY32C.js";
 // src/client.ts
@@ -4847,7 +4879,7 @@ var Dataset = class _Dataset {
    * Write to disk for contamination-verifiable archives.
    */
   toJsonl() {
-    return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(canonicalize(s))).join("\n") + "\n";
+    return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(canonicalize2(s))).join("\n") + "\n";
   }
   static fromJsonl(jsonl, manifest) {
     const scenarios = [];
@@ -4860,18 +4892,18 @@ var Dataset = class _Dataset {
   }
 };
 async function hashScenarios(scenarios) {
-  const canonical = scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map(canonicalize);
+  const canonical = scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map(canonicalize2);
   const text = JSON.stringify(canonical);
   const bytes = new TextEncoder().encode(text);
   const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
   return Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
 }
-function canonicalize(v) {
+function canonicalize2(v) {
   if (v === null || typeof v !== "object") return v;
-  if (Array.isArray(v)) return v.map(canonicalize);
+  if (Array.isArray(v)) return v.map(canonicalize2);
   const keys = Object.keys(v).sort();
   const out = {};
-  for (const k of keys) out[k] = canonicalize(v[k]);
+  for (const k of keys) out[k] = canonicalize2(v[k]);
   return out;
 }
 function seededShuffle(items, seed) {
@@ -6978,51 +7010,6 @@ function attributeStep(op, prmA, prmB) {
   };
 }
-// src/pre-registration.ts
-function canonicalize2(v) {
-  if (v === null || typeof v !== "object") return v;
-  if (Array.isArray(v)) return v.map(canonicalize2);
-  const keys = Object.keys(v).sort();
-  const out = {};
-  for (const k of keys) out[k] = canonicalize2(v[k]);
-  return out;
-}
-async function hashJson(obj) {
-  const canonical = canonicalize2(obj);
-  const bytes = new TextEncoder().encode(JSON.stringify(canonical));
-  const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
-  return Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
-}
-async function signManifest(m) {
-  const hash = await hashJson(m);
-  return { ...m, contentHash: hash, algo: "sha256-content" };
-}
-async function verifyManifest(m) {
-  const { contentHash, algo: _algo, ...rest } = m;
-  void _algo;
-  const resigned = await signManifest(rest);
-  return resigned.contentHash === contentHash;
-}
-async function evaluateHypothesis(manifest, observed) {
-  if (!await verifyManifest(manifest)) {
-    throw new Error("evaluateHypothesis: manifest content hash mismatch (tampered)");
-  }
-  const reasons = [];
-  const directionOk = manifest.direction === "increase" ? observed.effect > 0 : observed.effect < 0;
-  if (!directionOk) reasons.push("wrong_direction");
-  if (Math.abs(observed.effect) < manifest.minEffect) reasons.push("effect_too_small");
-  if (observed.pValue >= manifest.alpha) reasons.push("not_significant");
-  if (observed.n < manifest.preRegisteredN) reasons.push("undersampled");
-  return {
-    manifest,
-    observedN: observed.n,
-    observedEffect: observed.effect,
-    observedPValue: observed.pValue,
-    confirmed: reasons.length === 0,
-    rejectionReasons: reasons
-  };
-}
 // src/self-play.ts
 async function runSelfPlay(proposer, scorer, targets, options = {}) {
   if (targets.length < 2) throw new Error("runSelfPlay: at least 2 targets required (need a difference to measure)");
@@ -10481,6 +10468,7 @@ export {
   FileSystemExperimentStore,
   FileSystemFeedbackTrajectoryStore,
   FileSystemOutcomeStore,
+  FileSystemRawProviderSink,
   FileSystemTraceStore,
   HeldOutGate,
   HoldoutAuditor,
@@ -10489,6 +10477,7 @@ export {
   InMemoryExperimentStore,
   InMemoryFeedbackTrajectoryStore,
   InMemoryOutcomeStore,
+  InMemoryRawProviderSink,
   InMemoryTraceStore,
   InMemoryTrialCache,
   InMemoryWorkspaceInspector,
@@ -10497,12 +10486,14 @@ export {
   LineageRecorder,
   LlmCallError,
   LlmClient,
+  LlmRouteAssertionError,
   LockedJsonlAppender,
   MODEL_PRICING,
   MetricsCollector,
   MultiLayerVerifier,
   MutationTelemetry,
   Mutex,
+  NoopRawProviderSink,
   NoopResearcher,
   OTEL_AGENT_EVAL_SCOPE,
   OtlpFileTraceStore,
@@ -10512,7 +10503,11 @@ export {
   ProjectRegistry,
   PromptRegistry,
   REDACTION_VERSION,
+  RESEARCH_REPORT_HARD_PAIR_FLOOR,
+  ReplayCache,
+  ReplayCacheMissError,
   RunCritic,
+  RunIntegrityError,
   RunRecordValidationError,
   SEMANTIC_CONCEPT_JUDGE_VERSION,
   SandboxHarness,
@@ -10539,7 +10534,9 @@ export {
   analyzeSeries,
   analyzeTraces,
   argHash,
+  assertLlmRoute,
   assertReleaseConfidence,
+  assertRunCaptured,
   assignFeedbackSplit,
   attributeCounterfactuals,
   deterministicSplit as benchmarkDeterministicSplit,
@@ -10563,7 +10560,7 @@ export {
   callLlm,
   callLlmJson,
   canaryLeakView,
-  canonicalize2 as canonicalize,
+  canonicalize,
   causalAttribution,
   checkBehavioralCanary,
   checkCanaries,
@@ -10597,6 +10594,7 @@ export {
   createFeedbackTrajectory,
   createIntentMatchJudge,
   createLlmReviewer,
+  createReplayFetch,
   createSandboxCodeMutator,
   createSandboxPool,
   createSemanticConceptJudge,
@@ -10606,6 +10604,7 @@ export {
   decideReferenceReplayRunPromotion,
   defaultJudges,
   defaultMultiShotObjectives,
+  defaultProviderRedactor,
   defaultReferenceReplayMatcher,
   defaultTraceInsightPanel,
   deployGateLayer,
@@ -10619,6 +10618,7 @@ export {
   evaluateActionPolicy,
   evaluateContract,
   evaluateHypothesis,
+  evaluateInterimReleaseConfidence,
   evaluateOracles,
   evaluateReleaseConfidence,
   executeScenario,
@@ -10670,6 +10670,7 @@ export {
   isRunRecord,
   isSandboxSpan,
   isToolSpan,
+  iterateRawCalls,
   jestTestParser,
   jsonHasKeys,
   jsonShape,
@@ -10698,6 +10699,7 @@ export {
   objectiveEval,
   outputLengthRubric,
   pairedBootstrap,
+  pairedEvalueSequence,
   pairedTTest,
   pairedWilcoxon,
   paraphraseRobustness,
@@ -10720,6 +10722,7 @@ export {
   probeLlm,
   promptBisect,
   proposeSynthesisTargets,
+  providerFromBaseUrl,
   pytestTestParser,
   redTeamDataset,
   redTeamReport,
@@ -10742,17 +10745,20 @@ export {
   replayScorerOverCorpus,
   replayTraceThroughJudge,
   requiredSampleSize,
+  researchReport,
   resetLockedAppendersForTesting,
   resumeBuilderSession,
   roundTripRunRecord,
   rowCount,
   rowWhere,
+  rubricPredictiveValidity,
   runAgentControlLoop,
   runAssertions,
   runBehavioralCanaries,
   runCanaries,
   runCounterfactual,
   runE2EWorkflow,
+  runEvalCampaign,
   runExpectations,
   runFailureClass,
   runHarnessExperiment,
@@ -10799,6 +10805,7 @@ export {
   summaryTable,
   testJudge,
   textInSnapshot,
+  throwIfRunIncomplete,
   toLangfuseEnvelope,
   toNdjson,
   toPrometheusText,
@@ -10810,6 +10817,7 @@ export {
   toolSuccessRubric,
   toolWasteView,
   traceAnalystFunctionGroup,
+  traceAnalystOnRunComplete,
   trialTraceFromMultiShotTrial,
   typoMutator,
   urlContains,