npm - @tangle-network/agent-eval - Versions diffs - 0.32.0 → 0.33.1 - Mend

@tangle-network/agent-eval 0.32.0 → 0.33.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58)

package/CHANGELOG.md +30 -0
package/dist/benchmarks/index.d.ts +2 -2
package/dist/chunk-DCZXFOQN.js +489 -0
package/dist/chunk-DCZXFOQN.js.map +1 -0
package/dist/{chunk-B73G44OH.js → chunk-FT3IAMQR.js} +5 -5
package/dist/chunk-FT3IAMQR.js.map +1 -0
package/dist/{chunk-GVQT44CS.js → chunk-KE7TDJUO.js} +2 -2
package/dist/{chunk-4L3WJXQJ.js → chunk-KHZRNY3F.js} +163 -2
package/dist/{chunk-4L3WJXQJ.js.map → chunk-KHZRNY3F.js.map} +1 -1
package/dist/{chunk-WGXZAQLR.js → chunk-LGAPK7NA.js} +2 -2
package/dist/{chunk-DTEJNZYK.js → chunk-SQYRO3BT.js} +47 -4
package/dist/chunk-SQYRO3BT.js.map +1 -0
package/dist/{chunk-CXJOVDJR.js → chunk-TQL7BAOY.js} +5 -175
package/dist/chunk-TQL7BAOY.js.map +1 -0
package/dist/{chunk-M6RZ5LJN.js → chunk-VXNVVBZO.js} +34 -5
package/dist/chunk-VXNVVBZO.js.map +1 -0
package/dist/{chunk-S4Y5VXMS.js → chunk-WRGHMGWT.js} +2 -2
package/dist/{chunk-SMSGXM74.js → chunk-YU3G6I7F.js} +2 -2
package/dist/cli.js +2 -2
package/dist/{control-p2ns7elI.d.ts → control-C3k02SCP.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/control.js +3 -2
package/dist/governance/index.d.ts +2 -1
package/dist/{index-DPILdKbP.d.ts → index-CN2agEaO.d.ts} +2 -142
package/dist/{index-BTqhGHJT.d.ts → index-ClMxVqe_.d.ts} +1 -1
package/dist/index.d.ts +39 -486
package/dist/index.js +75 -68
package/dist/index.js.map +1 -1
package/dist/judge-calibration-DilmB3Ml.d.ts +142 -0
package/dist/meta-eval/index.d.ts +2 -2
package/dist/openapi.json +1 -1
package/dist/optimization.d.ts +3 -3
package/dist/optimization.js +6 -6
package/dist/pipelines/index.js +2 -2
package/dist/release-report-ChfmCmLi.d.ts +713 -0
package/dist/reporting.d.ts +6 -4
package/dist/reporting.js +10 -9
package/dist/{researcher-BRHa5Jxo.d.ts → researcher-CfnL3HEb.d.ts} +34 -3
package/dist/rl.d.ts +5 -5
package/dist/rl.js +6 -6
package/dist/rl.js.map +1 -1
package/dist/{rubric-predictive-validity-CMHypZ_M.d.ts → rubric-predictive-validity-BvaNwfBE.d.ts} +1 -1
package/dist/{run-record-BfX5y68A.d.ts → run-record-YinVdFwu.d.ts} +78 -2
package/dist/{summary-report-D7AQS7eB.d.ts → summary-report-BPJVzIeW.d.ts} +2 -2
package/dist/wire/index.js +2 -2
package/docs/product-eval-adoption.md +18 -0
package/package.json +22 -12
package/dist/chunk-B73G44OH.js.map +0 -1
package/dist/chunk-CXJOVDJR.js.map +0 -1
package/dist/chunk-DTEJNZYK.js.map +0 -1
package/dist/chunk-M6RZ5LJN.js.map +0 -1
package/dist/chunk-ZN2CMQIW.js +0 -208
package/dist/chunk-ZN2CMQIW.js.map +0 -1
package/dist/release-report-DLWbBPtH.d.ts +0 -292
package/dist/{chunk-GVQT44CS.js.map → chunk-KE7TDJUO.js.map} +0 -0
package/dist/{chunk-WGXZAQLR.js.map → chunk-LGAPK7NA.js.map} +0 -0
package/dist/{chunk-S4Y5VXMS.js.map → chunk-WRGHMGWT.js.map} +0 -0
package/dist/{chunk-SMSGXM74.js.map → chunk-YU3G6I7F.js.map} +0 -0

package/dist/index.js CHANGED

@@ -11,7 +11,7 @@ import {
   failureClusterView,
   iqr,
   welchsTTest
-} from "./chunk-GVQT44CS.js";
+} from "./chunk-KE7TDJUO.js";
 import {
   exportTrainingData,
   toNdjson
@@ -54,7 +54,7 @@ import {
   runProposeReview,
   runProposeReviewAsControlLoop,
   scoreFromEvals
-} from "./chunk-S4Y5VXMS.js";
+} from "./chunk-WRGHMGWT.js";
 import {
   allCriticalPassed,
   objectiveEval,
@@ -96,14 +96,7 @@ import {
   summarizePreferenceMemory,
   trialTraceFromMultiShotTrial,
   withAssignedFeedbackSplit
-} from "./chunk-B73G44OH.js";
-import {
-  RunRecordValidationError,
-  isRunRecord,
-  parseRunRecordSafe,
-  roundTripRunRecord,
-  validateRunRecord
-} from "./chunk-ZN2CMQIW.js";
+} from "./chunk-FT3IAMQR.js";
 import {
   assertReleaseConfidence,
   bootstrapCi,
@@ -111,38 +104,52 @@ import {
   judgeReplayGate,
   releaseTraceEvidenceFromMultiShotTrials,
   renderReleaseReport
-} from "./chunk-WGXZAQLR.js";
+} from "./chunk-LGAPK7NA.js";
 import {
   runEvalCampaign
-} from "./chunk-DTEJNZYK.js";
+} from "./chunk-SQYRO3BT.js";
 import {
   LlmCallError,
   LlmClient,
   LlmRouteAssertionError,
   assertLlmRoute,
+  backoffMs,
   callLlm,
   callLlmJson,
+  isTransientLlmError,
   probeLlm,
   stripFencedJson
-} from "./chunk-M6RZ5LJN.js";
+} from "./chunk-VXNVVBZO.js";
+import {
+  AgentProfileCellValidationError,
+  RunRecordValidationError,
+  agentProfileCellHashMaterial,
+  agentProfileCellKey,
+  assertRunAgentProfileCell,
+  buildAgentProfileCell,
+  groupRunsByAgentProfileCell,
+  isRunRecord,
+  parseRunRecordSafe,
+  requireAgentProfileCell,
+  roundTripRunRecord,
+  validateAgentProfileCell,
+  validateRunRecord,
+  verifyAgentProfileCell
+} from "./chunk-DCZXFOQN.js";
 import {
   evaluateInterimReleaseConfidence,
   pairedEvalueSequence
 } from "./chunk-MAZ26DC7.js";
 import {
   RESEARCH_REPORT_HARD_PAIR_FLOOR,
-  benjaminiHochberg,
-  bhAdjust,
-  bonferroni,
   gainHistogram,
-  pairedBootstrap,
-  pairedWilcoxon,
   paretoChart,
-  requiredSampleSize,
   researchReport,
   summaryTable
-} from "./chunk-CXJOVDJR.js";
+} from "./chunk-TQL7BAOY.js";
 import {
+  benjaminiHochberg,
+  bonferroni,
   calibrateJudge,
   calibrateJudgeContinuous,
   cohensD,
@@ -153,14 +160,17 @@ import {
   interRaterReliability,
   mannWhitneyU,
   normalizeScores,
+  pairedBootstrap,
+  pairedMde,
   pairedTTest,
   partialCredit,
   positionalBias,
+  requiredSampleSize,
   selfPreference,
   verbosityBias,
   weightedMean,
   wilcoxonSignedRank
-} from "./chunk-4L3WJXQJ.js";
+} from "./chunk-KHZRNY3F.js";
 import {
   DEFAULT_REDACTION_RULES,
   DEFAULT_TRACE_ANALYST_BUDGETS,
@@ -2991,29 +3001,13 @@ var AgentDriver = class {
   }
   /** Use the driver LLM to decide what the "user" says next */
   async decideNextMessage(persona, state, history) {
-    const lastResponse = history.length > 0 ? history[history.length - 1].content.slice(0, 2e3) : "(no conversation yet \u2014 this is the first message)";
-    const recentHistory = history.slice(-6).map((h) => `${h.role}: ${h.content.slice(0, 500)}`).join("\n\n");
-    const resp = await this.tc.chat({
-      model: this.driverModel,
-      messages: [
-        {
-          role: "system",
-          content: buildDriverSystemPrompt(persona, state, this.productContext)
-        },
-        {
-          role: "user",
-          content: recentHistory ? `Recent conversation:
-${recentHistory}
-The agent's latest response:
-${lastResponse}` : "No conversation yet. Send your opening message \u2014 in character, phrased as this person actually would."
-        }
-      ],
-      temperature: 0.5,
-      maxTokens: 700
+    return decideNextUserTurn(this.tc, {
+      persona,
+      state,
+      history,
+      productContext: this.productContext,
+      model: this.driverModel
     });
-    const content = resp.choices?.[0]?.message?.content ?? "";
-    return content.trim();
   }
   /** Handle pending approvals based on persona feedback patterns */
   async handleApprovals(persona, workspaceId, _state) {
@@ -3081,6 +3075,29 @@ Sign-off: respond with exactly "DONE" only when a ${persona.role} would act on t
 Output ONLY your next message to the agent \u2014 in character, first person, no meta-commentary, no stage directions.`;
 }
+async function decideNextUserTurn(tc, opts) {
+  const { persona, state, history, productContext = "", model = "claude-sonnet-4-6" } = opts;
+  const lastResponse = history.length > 0 ? history[history.length - 1].content.slice(0, 2e3) : "(no conversation yet \u2014 this is the first message)";
+  const recentHistory = history.slice(-6).map((h) => `${h.role}: ${h.content.slice(0, 500)}`).join("\n\n");
+  const resp = await tc.chat({
+    model,
+    messages: [
+      { role: "system", content: buildDriverSystemPrompt(persona, state, productContext) },
+      {
+        role: "user",
+        content: recentHistory ? `Recent conversation:
+${recentHistory}
+The agent's latest response:
+${lastResponse}` : "No conversation yet. Send your opening message \u2014 in character, phrased as this person actually would."
+      }
+    ],
+    temperature: 0.5,
+    maxTokens: 700
+  });
+  const content = resp.choices?.[0]?.message?.content ?? "";
+  return content.trim();
+}
 // src/integration-gates.ts
 function integrationManifestValidatedPayload(input) {
@@ -10309,35 +10326,14 @@ var JsonlTrialCache = class {
 // src/judge-retry.ts
 var DEFAULT_MAX_ATTEMPTS = 3;
 var DEFAULT_TIMEOUT_MS = 9e4;
-var DEFAULT_BACKOFF = (attempt) => Math.min(500 * 2 ** attempt, 16e3);
-var ABORT_PATTERNS = [
-  /AbortError/i,
-  /TimeoutError/i,
-  /fetch failed/i,
-  /ECONNRESET/i,
-  /ETIMEDOUT/i,
-  /EAI_AGAIN/i,
-  /this operation was aborted/i,
-  /stream.*ended.*unexpectedly/i,
-  /socket hang up/i
-];
-var RETRYABLE_HTTP_STATUS = /* @__PURE__ */ new Set([429, 502, 503, 504]);
-function defaultIsRetryable(err) {
-  if (err instanceof Error) {
-    if (ABORT_PATTERNS.some((p) => p.test(err.message) || p.test(err.name))) return true;
-    const status = err.status;
-    if (typeof status === "number" && RETRYABLE_HTTP_STATUS.has(status)) return true;
-  }
-  return false;
-}
 function sleep(ms) {
   return new Promise((resolve) => setTimeout(resolve, ms));
 }
 async function withJudgeRetry(judgeFn, policy = {}) {
   const maxAttempts = policy.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
   const timeoutMs = policy.timeoutMs ?? DEFAULT_TIMEOUT_MS;
-  const backoff = policy.backoffMs ?? DEFAULT_BACKOFF;
-  const isRetryable = policy.isRetryable ?? defaultIsRetryable;
+  const backoff = policy.backoffMs ?? backoffMs;
+  const isRetryable = policy.isRetryable ?? isTransientLlmError;
   const models = policy.models && policy.models.length > 0 ? policy.models : [void 0];
   let totalAttempts = 0;
   const attemptErrors = [];
@@ -10660,6 +10656,7 @@ export {
   ANALYST_SEVERITIES,
   AgentDriver,
   AgentEvalError,
+  AgentProfileCellValidationError,
   AnalystRegistry,
   AxGepaSteeringOptimizer,
   BENCHMARK_SPLIT_SEED,
@@ -10767,6 +10764,8 @@ export {
   VerificationError,
   acquisitionPlansForKnowledgeGaps,
   adversarialJudge,
+  agentProfileCellHashMaterial,
+  agentProfileCellKey,
   aggregateLlm,
   aggregateRunScore,
   aggregateTrialsByMode,
@@ -10778,17 +10777,19 @@ export {
   assertLlmRoute,
   assertRealBackend,
   assertReleaseConfidence,
+  assertRunAgentProfileCell,
   assertRunCaptured,
   assignFeedbackSplit,
   attributeCounterfactuals,
+  backoffMs,
   deterministicSplit as benchmarkDeterministicSplit,
   benchmarks_exports as benchmarks,
   benjaminiHochberg,
-  bhAdjust,
   bisect,
   blockingKnowledgeEval,
   bonferroni,
   bootstrapCi,
+  buildAgentProfileCell,
   buildDriverSystemPrompt,
   buildReflectionPrompt,
   buildReviewerPrompt,
@@ -10853,6 +10854,7 @@ export {
   createVerifierAdapter,
   crossTraceDiff,
   crowdingDistance,
+  decideNextUserTurn,
   decideReferenceReplayPromotion,
   decideReferenceReplayRunPromotion,
   defaultIsMaterial,
@@ -10904,6 +10906,7 @@ export {
   precision as goldenPrecision,
   gradeSemanticStatus,
   groupBy,
+  groupRunsByAgentProfileCell,
   hashContent,
   hashJson,
   hashScenarios,
@@ -10925,6 +10928,7 @@ export {
   isRunRecord,
   isSandboxSpan,
   isToolSpan,
+  isTransientLlmError,
   iterateRawCalls,
   jestTestParser,
   jsonHasKeys,
@@ -10954,8 +10958,8 @@ export {
   objectiveEval,
   pairedBootstrap,
   pairedEvalueSequence,
+  pairedMde,
   pairedTTest,
-  pairedWilcoxon,
   paraphraseRobustness,
   paraphraseRobustnessScenarios,
   paretoChart,
@@ -11001,6 +11005,7 @@ export {
   replayFeedbackTrajectory,
   replayScorerOverCorpus,
   replayTraceThroughJudge,
+  requireAgentProfileCell,
   requiredSampleSize,
   researchReport,
   resetLockedAppendersForTesting,
@@ -11071,8 +11076,10 @@ export {
   typoMutator,
   urlContains,
   userQuestionsForKnowledgeGaps,
+  validateAgentProfileCell,
   validateRunRecord,
   verbosityBias,
+  verifyAgentProfileCell,
   verifyCompletion,
   verifyManifest,
   visualDiff,