npm - @tangle-network/agent-eval - Versions diffs - 0.33.0 → 0.33.1 - Mend

@tangle-network/agent-eval 0.33.0 → 0.33.1

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

package/dist/benchmarks/index.d.ts +2 -2
package/dist/chunk-DCZXFOQN.js +489 -0
package/dist/chunk-DCZXFOQN.js.map +1 -0
package/dist/{chunk-B73G44OH.js → chunk-FT3IAMQR.js} +5 -5
package/dist/chunk-FT3IAMQR.js.map +1 -0
package/dist/{chunk-GVQT44CS.js → chunk-KE7TDJUO.js} +2 -2
package/dist/{chunk-4L3WJXQJ.js → chunk-KHZRNY3F.js} +163 -2
package/dist/{chunk-4L3WJXQJ.js.map → chunk-KHZRNY3F.js.map} +1 -1
package/dist/{chunk-WGXZAQLR.js → chunk-LGAPK7NA.js} +2 -2
package/dist/{chunk-DTEJNZYK.js → chunk-SQYRO3BT.js} +47 -4
package/dist/chunk-SQYRO3BT.js.map +1 -0
package/dist/{chunk-CXJOVDJR.js → chunk-TQL7BAOY.js} +5 -175
package/dist/chunk-TQL7BAOY.js.map +1 -0
package/dist/{chunk-M6RZ5LJN.js → chunk-VXNVVBZO.js} +34 -5
package/dist/chunk-VXNVVBZO.js.map +1 -0
package/dist/{chunk-S4Y5VXMS.js → chunk-WRGHMGWT.js} +2 -2
package/dist/{chunk-SMSGXM74.js → chunk-YU3G6I7F.js} +2 -2
package/dist/cli.js +2 -2
package/dist/{control-p2ns7elI.d.ts → control-C3k02SCP.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/control.js +3 -2
package/dist/governance/index.d.ts +2 -1
package/dist/{index-DPILdKbP.d.ts → index-CN2agEaO.d.ts} +2 -142
package/dist/{index-BTqhGHJT.d.ts → index-ClMxVqe_.d.ts} +1 -1
package/dist/index.d.ts +18 -486
package/dist/index.js +45 -46
package/dist/index.js.map +1 -1
package/dist/judge-calibration-DilmB3Ml.d.ts +142 -0
package/dist/meta-eval/index.d.ts +2 -2
package/dist/openapi.json +1 -1
package/dist/optimization.d.ts +3 -3
package/dist/optimization.js +6 -6
package/dist/pipelines/index.js +2 -2
package/dist/release-report-ChfmCmLi.d.ts +713 -0
package/dist/reporting.d.ts +6 -4
package/dist/reporting.js +10 -9
package/dist/{researcher-BRHa5Jxo.d.ts → researcher-CfnL3HEb.d.ts} +34 -3
package/dist/rl.d.ts +5 -5
package/dist/rl.js +6 -6
package/dist/rl.js.map +1 -1
package/dist/{rubric-predictive-validity-CMHypZ_M.d.ts → rubric-predictive-validity-BvaNwfBE.d.ts} +1 -1
package/dist/{run-record-BfX5y68A.d.ts → run-record-YinVdFwu.d.ts} +78 -2
package/dist/{summary-report-D7AQS7eB.d.ts → summary-report-BPJVzIeW.d.ts} +2 -2
package/dist/wire/index.js +2 -2
package/docs/product-eval-adoption.md +18 -0
package/package.json +1 -1
package/dist/chunk-B73G44OH.js.map +0 -1
package/dist/chunk-CXJOVDJR.js.map +0 -1
package/dist/chunk-DTEJNZYK.js.map +0 -1
package/dist/chunk-M6RZ5LJN.js.map +0 -1
package/dist/chunk-ZN2CMQIW.js +0 -208
package/dist/chunk-ZN2CMQIW.js.map +0 -1
package/dist/release-report-DLWbBPtH.d.ts +0 -292
/package/dist/{chunk-GVQT44CS.js.map → chunk-KE7TDJUO.js.map} +0 -0
/package/dist/{chunk-WGXZAQLR.js.map → chunk-LGAPK7NA.js.map} +0 -0
/package/dist/{chunk-S4Y5VXMS.js.map → chunk-WRGHMGWT.js.map} +0 -0
/package/dist/{chunk-SMSGXM74.js.map → chunk-YU3G6I7F.js.map} +0 -0

package/dist/index.js CHANGED Viewed

@@ -11,7 +11,7 @@ import {
   failureClusterView,
   iqr,
   welchsTTest
-} from "./chunk-GVQT44CS.js";
+} from "./chunk-KE7TDJUO.js";
 import {
   exportTrainingData,
   toNdjson
@@ -54,7 +54,7 @@ import {
   runProposeReview,
   runProposeReviewAsControlLoop,
   scoreFromEvals
-} from "./chunk-S4Y5VXMS.js";
+} from "./chunk-WRGHMGWT.js";
 import {
   allCriticalPassed,
   objectiveEval,
@@ -96,14 +96,7 @@ import {
   summarizePreferenceMemory,
   trialTraceFromMultiShotTrial,
   withAssignedFeedbackSplit
-} from "./chunk-B73G44OH.js";
-import {
-  RunRecordValidationError,
-  isRunRecord,
-  parseRunRecordSafe,
-  roundTripRunRecord,
-  validateRunRecord
-} from "./chunk-ZN2CMQIW.js";
+} from "./chunk-FT3IAMQR.js";
 import {
   assertReleaseConfidence,
   bootstrapCi,
@@ -111,38 +104,52 @@ import {
   judgeReplayGate,
   releaseTraceEvidenceFromMultiShotTrials,
   renderReleaseReport
-} from "./chunk-WGXZAQLR.js";
+} from "./chunk-LGAPK7NA.js";
 import {
   runEvalCampaign
-} from "./chunk-DTEJNZYK.js";
+} from "./chunk-SQYRO3BT.js";
 import {
   LlmCallError,
   LlmClient,
   LlmRouteAssertionError,
   assertLlmRoute,
+  backoffMs,
   callLlm,
   callLlmJson,
+  isTransientLlmError,
   probeLlm,
   stripFencedJson
-} from "./chunk-M6RZ5LJN.js";
+} from "./chunk-VXNVVBZO.js";
+import {
+  AgentProfileCellValidationError,
+  RunRecordValidationError,
+  agentProfileCellHashMaterial,
+  agentProfileCellKey,
+  assertRunAgentProfileCell,
+  buildAgentProfileCell,
+  groupRunsByAgentProfileCell,
+  isRunRecord,
+  parseRunRecordSafe,
+  requireAgentProfileCell,
+  roundTripRunRecord,
+  validateAgentProfileCell,
+  validateRunRecord,
+  verifyAgentProfileCell
+} from "./chunk-DCZXFOQN.js";
 import {
   evaluateInterimReleaseConfidence,
   pairedEvalueSequence
 } from "./chunk-MAZ26DC7.js";
 import {
   RESEARCH_REPORT_HARD_PAIR_FLOOR,
-  benjaminiHochberg,
-  bhAdjust,
-  bonferroni,
   gainHistogram,
-  pairedBootstrap,
-  pairedWilcoxon,
   paretoChart,
-  requiredSampleSize,
   researchReport,
   summaryTable
-} from "./chunk-CXJOVDJR.js";
+} from "./chunk-TQL7BAOY.js";
 import {
+  benjaminiHochberg,
+  bonferroni,
   calibrateJudge,
   calibrateJudgeContinuous,
   cohensD,
@@ -153,14 +160,17 @@ import {
   interRaterReliability,
   mannWhitneyU,
   normalizeScores,
+  pairedBootstrap,
+  pairedMde,
   pairedTTest,
   partialCredit,
   positionalBias,
+  requiredSampleSize,
   selfPreference,
   verbosityBias,
   weightedMean,
   wilcoxonSignedRank
-} from "./chunk-4L3WJXQJ.js";
+} from "./chunk-KHZRNY3F.js";
 import {
   DEFAULT_REDACTION_RULES,
   DEFAULT_TRACE_ANALYST_BUDGETS,
@@ -10316,35 +10326,14 @@ var JsonlTrialCache = class {
 // src/judge-retry.ts
 var DEFAULT_MAX_ATTEMPTS = 3;
 var DEFAULT_TIMEOUT_MS = 9e4;
-var DEFAULT_BACKOFF = (attempt) => Math.min(500 * 2 ** attempt, 16e3);
-var ABORT_PATTERNS = [
-  /AbortError/i,
-  /TimeoutError/i,
-  /fetch failed/i,
-  /ECONNRESET/i,
-  /ETIMEDOUT/i,
-  /EAI_AGAIN/i,
-  /this operation was aborted/i,
-  /stream.*ended.*unexpectedly/i,
-  /socket hang up/i
-];
-var RETRYABLE_HTTP_STATUS = /* @__PURE__ */ new Set([429, 502, 503, 504]);
-function defaultIsRetryable(err) {
-  if (err instanceof Error) {
-    if (ABORT_PATTERNS.some((p) => p.test(err.message) || p.test(err.name))) return true;
-    const status = err.status;
-    if (typeof status === "number" && RETRYABLE_HTTP_STATUS.has(status)) return true;
-  }
-  return false;
-}
 function sleep(ms) {
   return new Promise((resolve) => setTimeout(resolve, ms));
 }
 async function withJudgeRetry(judgeFn, policy = {}) {
   const maxAttempts = policy.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
   const timeoutMs = policy.timeoutMs ?? DEFAULT_TIMEOUT_MS;
-  const backoff = policy.backoffMs ?? DEFAULT_BACKOFF;
-  const isRetryable = policy.isRetryable ?? defaultIsRetryable;
+  const backoff = policy.backoffMs ?? backoffMs;
+  const isRetryable = policy.isRetryable ?? isTransientLlmError;
   const models = policy.models && policy.models.length > 0 ? policy.models : [void 0];
   let totalAttempts = 0;
   const attemptErrors = [];
@@ -10667,6 +10656,7 @@ export {
   ANALYST_SEVERITIES,
   AgentDriver,
   AgentEvalError,
+  AgentProfileCellValidationError,
   AnalystRegistry,
   AxGepaSteeringOptimizer,
   BENCHMARK_SPLIT_SEED,
@@ -10774,6 +10764,8 @@ export {
   VerificationError,
   acquisitionPlansForKnowledgeGaps,
   adversarialJudge,
+  agentProfileCellHashMaterial,
+  agentProfileCellKey,
   aggregateLlm,
   aggregateRunScore,
   aggregateTrialsByMode,
@@ -10785,17 +10777,19 @@ export {
   assertLlmRoute,
   assertRealBackend,
   assertReleaseConfidence,
+  assertRunAgentProfileCell,
   assertRunCaptured,
   assignFeedbackSplit,
   attributeCounterfactuals,
+  backoffMs,
   deterministicSplit as benchmarkDeterministicSplit,
   benchmarks_exports as benchmarks,
   benjaminiHochberg,
-  bhAdjust,
   bisect,
   blockingKnowledgeEval,
   bonferroni,
   bootstrapCi,
+  buildAgentProfileCell,
   buildDriverSystemPrompt,
   buildReflectionPrompt,
   buildReviewerPrompt,
@@ -10912,6 +10906,7 @@ export {
   precision as goldenPrecision,
   gradeSemanticStatus,
   groupBy,
+  groupRunsByAgentProfileCell,
   hashContent,
   hashJson,
   hashScenarios,
@@ -10933,6 +10928,7 @@ export {
   isRunRecord,
   isSandboxSpan,
   isToolSpan,
+  isTransientLlmError,
   iterateRawCalls,
   jestTestParser,
   jsonHasKeys,
@@ -10962,8 +10958,8 @@ export {
   objectiveEval,
   pairedBootstrap,
   pairedEvalueSequence,
+  pairedMde,
   pairedTTest,
-  pairedWilcoxon,
   paraphraseRobustness,
   paraphraseRobustnessScenarios,
   paretoChart,
@@ -11009,6 +11005,7 @@ export {
   replayFeedbackTrajectory,
   replayScorerOverCorpus,
   replayTraceThroughJudge,
+  requireAgentProfileCell,
   requiredSampleSize,
   researchReport,
   resetLockedAppendersForTesting,
@@ -11079,8 +11076,10 @@ export {
   typoMutator,
   urlContains,
   userQuestionsForKnowledgeGaps,
+  validateAgentProfileCell,
   validateRunRecord,
   verbosityBias,
+  verifyAgentProfileCell,
   verifyCompletion,
   verifyManifest,
   visualDiff,