@tangle-network/agent-eval 0.30.0 → 0.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/dist/{baseline-BwdCXUS8.d.ts → baseline-4R5deP0N.d.ts} +1 -1
  2. package/dist/benchmarks/index.d.ts +3 -3
  3. package/dist/builder-eval/index.d.ts +3 -3
  4. package/dist/builder-eval/index.js +2 -2
  5. package/dist/{chunk-R5UQJNKC.js → chunk-4L3WJXQJ.js} +2 -2
  6. package/dist/{chunk-RUI6SIHY.js → chunk-75ZREHD7.js} +4 -4
  7. package/dist/{chunk-5AKPEK5L.js → chunk-CXJOVDJR.js} +2 -2
  8. package/dist/{chunk-K33INZHH.js → chunk-GVQT44CS.js} +2 -2
  9. package/dist/{chunk-UW4NOOZI.js → chunk-HIO4UIS5.js} +308 -2
  10. package/dist/chunk-HIO4UIS5.js.map +1 -0
  11. package/dist/{chunk-4S4BM3QQ.js → chunk-M6RZ5LJN.js} +2 -2
  12. package/dist/{chunk-NG236HPC.js → chunk-QYJT52YW.js} +1 -1
  13. package/dist/chunk-QYJT52YW.js.map +1 -0
  14. package/dist/{chunk-XFZCM5Z3.js → chunk-SMSGXM74.js} +2 -2
  15. package/dist/{chunk-KTGTIOFD.js → chunk-UBPIXOC4.js} +2 -2
  16. package/dist/{chunk-DBIGN5MJ.js → chunk-WGXZAQLR.js} +3 -3
  17. package/dist/{chunk-NLMNWKVM.js → chunk-WSI4K3WB.js} +2 -2
  18. package/dist/{chunk-PALJO75S.js → chunk-XEL6UP7C.js} +2 -2
  19. package/dist/{chunk-SZSBQUIJ.js → chunk-Y2CPBYKH.js} +3 -3
  20. package/dist/{chunk-QHF6EQKK.js → chunk-YTMXBHFM.js} +2 -2
  21. package/dist/cli.js +3 -3
  22. package/dist/{control-rJhEDdpy.d.ts → control-BFpqHFV2.d.ts} +5 -5
  23. package/dist/{control-runtime-BRdQ0wrx.d.ts → control-runtime-BZ_lVLYW.d.ts} +2 -2
  24. package/dist/control.d.ts +8 -8
  25. package/dist/control.js +3 -3
  26. package/dist/{dataset-CiK_3LDr.d.ts → dataset-ueRVTUoY.d.ts} +1 -1
  27. package/dist/{emitter-BqjeOvJh.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  28. package/dist/{errors-BZ9sTdz7.d.ts → errors-mje_cKOs.d.ts} +1 -1
  29. package/dist/{failure-cluster-D1NZKqYu.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -1
  30. package/dist/{feedback-trajectory-j0nJFgC6.d.ts → feedback-trajectory-iATEAHmc.d.ts} +2 -2
  31. package/dist/governance/index.d.ts +4 -4
  32. package/dist/{index-Cgt3DKXr.d.ts → index-DPILdKbP.d.ts} +2 -2
  33. package/dist/{index--fVrWDiR.d.ts → index-TVjRYWRm.d.ts} +1 -1
  34. package/dist/index.d.ts +108 -38
  35. package/dist/index.js +159 -14
  36. package/dist/index.js.map +1 -1
  37. package/dist/{integrity-BAxLGJ9I.d.ts → integrity-DYR5gWlb.d.ts} +2 -2
  38. package/dist/knowledge/index.d.ts +3 -3
  39. package/dist/meta-eval/index.d.ts +4 -4
  40. package/dist/openapi.json +1 -1
  41. package/dist/optimization.d.ts +11 -11
  42. package/dist/optimization.js +8 -8
  43. package/dist/pipelines/index.d.ts +6 -6
  44. package/dist/pipelines/index.js +3 -3
  45. package/dist/prm/index.d.ts +4 -4
  46. package/dist/{query-BFDT0kX_.d.ts → query-DODUYdPg.d.ts} +1 -1
  47. package/dist/{release-report-PWhGlpfO.d.ts → release-report-C8r4Vben.d.ts} +3 -3
  48. package/dist/reporting.d.ts +8 -8
  49. package/dist/reporting.js +4 -4
  50. package/dist/{researcher-ClDX3KZx.d.ts → researcher-BmgJ_901.d.ts} +6 -6
  51. package/dist/rl.d.ts +10 -10
  52. package/dist/rl.js +6 -6
  53. package/dist/{rubric-DgSqjqqj.d.ts → rubric-D5tjHNJQ.d.ts} +2 -2
  54. package/dist/{rubric-predictive-validity-C0uDYwG6.d.ts → rubric-predictive-validity-Bm-CbN46.d.ts} +1 -1
  55. package/dist/{run-record-CqzahIbx.d.ts → run-record-nYf9x2hU.d.ts} +1 -1
  56. package/dist/{store-BP5be6s7.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  57. package/dist/{summary-report-jrSGb2xZ.d.ts → summary-report-dir7A-eQ.d.ts} +2 -2
  58. package/dist/{test-graded-scenario-BJ54PDan.d.ts → test-graded-scenario-B2kWEdh9.d.ts} +2 -2
  59. package/dist/traces.d.ts +533 -10
  60. package/dist/traces.js +14 -300
  61. package/dist/traces.js.map +1 -1
  62. package/dist/{trajectory-BFmveYZt.d.ts → trajectory-CnoBo-JY.d.ts} +1 -1
  63. package/dist/wire/index.d.ts +6 -6
  64. package/dist/wire/index.js +3 -3
  65. package/package.json +12 -21
  66. package/dist/chunk-NG236HPC.js.map +0 -1
  67. package/dist/chunk-UW4NOOZI.js.map +0 -1
  68. package/dist/replay-BX5Fm8en.d.ts +0 -529
  69. /package/dist/{chunk-R5UQJNKC.js.map → chunk-4L3WJXQJ.js.map} +0 -0
  70. /package/dist/{chunk-RUI6SIHY.js.map → chunk-75ZREHD7.js.map} +0 -0
  71. /package/dist/{chunk-5AKPEK5L.js.map → chunk-CXJOVDJR.js.map} +0 -0
  72. /package/dist/{chunk-K33INZHH.js.map → chunk-GVQT44CS.js.map} +0 -0
  73. /package/dist/{chunk-4S4BM3QQ.js.map → chunk-M6RZ5LJN.js.map} +0 -0
  74. /package/dist/{chunk-XFZCM5Z3.js.map → chunk-SMSGXM74.js.map} +0 -0
  75. /package/dist/{chunk-KTGTIOFD.js.map → chunk-UBPIXOC4.js.map} +0 -0
  76. /package/dist/{chunk-DBIGN5MJ.js.map → chunk-WGXZAQLR.js.map} +0 -0
  77. /package/dist/{chunk-NLMNWKVM.js.map → chunk-WSI4K3WB.js.map} +0 -0
  78. /package/dist/{chunk-PALJO75S.js.map → chunk-XEL6UP7C.js.map} +0 -0
  79. /package/dist/{chunk-SZSBQUIJ.js.map → chunk-Y2CPBYKH.js.map} +0 -0
  80. /package/dist/{chunk-QHF6EQKK.js.map → chunk-YTMXBHFM.js.map} +0 -0
package/dist/index.js CHANGED
@@ -11,7 +11,7 @@ import {
11
11
  failureClusterView,
12
12
  iqr,
13
13
  welchsTTest
14
- } from "./chunk-K33INZHH.js";
14
+ } from "./chunk-GVQT44CS.js";
15
15
  import {
16
16
  exportTrainingData,
17
17
  toNdjson
@@ -28,7 +28,7 @@ import {
28
28
  pytestTestParser,
29
29
  runTestGradedScenario,
30
30
  vitestTestParser
31
- } from "./chunk-QHF6EQKK.js";
31
+ } from "./chunk-YTMXBHFM.js";
32
32
  import {
33
33
  classifyEuAiRisk,
34
34
  euAiActReport,
@@ -54,7 +54,7 @@ import {
54
54
  runProposeReview,
55
55
  runProposeReviewAsControlLoop,
56
56
  scoreFromEvals
57
- } from "./chunk-PALJO75S.js";
57
+ } from "./chunk-XEL6UP7C.js";
58
58
  import {
59
59
  allCriticalPassed,
60
60
  objectiveEval,
@@ -96,14 +96,14 @@ import {
96
96
  summarizePreferenceMemory,
97
97
  trialTraceFromMultiShotTrial,
98
98
  withAssignedFeedbackSplit
99
- } from "./chunk-SZSBQUIJ.js";
99
+ } from "./chunk-Y2CPBYKH.js";
100
100
  import {
101
101
  RunRecordValidationError,
102
102
  isRunRecord,
103
103
  parseRunRecordSafe,
104
104
  roundTripRunRecord,
105
105
  validateRunRecord
106
- } from "./chunk-NLMNWKVM.js";
106
+ } from "./chunk-WSI4K3WB.js";
107
107
  import {
108
108
  assertReleaseConfidence,
109
109
  bootstrapCi,
@@ -111,10 +111,10 @@ import {
111
111
  judgeReplayGate,
112
112
  releaseTraceEvidenceFromMultiShotTrials,
113
113
  renderReleaseReport
114
- } from "./chunk-DBIGN5MJ.js";
114
+ } from "./chunk-WGXZAQLR.js";
115
115
  import {
116
116
  runEvalCampaign
117
- } from "./chunk-RUI6SIHY.js";
117
+ } from "./chunk-75ZREHD7.js";
118
118
  import {
119
119
  LlmCallError,
120
120
  LlmClient,
@@ -124,7 +124,7 @@ import {
124
124
  callLlmJson,
125
125
  probeLlm,
126
126
  stripFencedJson
127
- } from "./chunk-4S4BM3QQ.js";
127
+ } from "./chunk-M6RZ5LJN.js";
128
128
  import {
129
129
  evaluateInterimReleaseConfidence,
130
130
  pairedEvalueSequence
@@ -141,7 +141,7 @@ import {
141
141
  requiredSampleSize,
142
142
  researchReport,
143
143
  summaryTable
144
- } from "./chunk-5AKPEK5L.js";
144
+ } from "./chunk-CXJOVDJR.js";
145
145
  import {
146
146
  calibrateJudge,
147
147
  calibrateJudgeContinuous,
@@ -160,24 +160,43 @@ import {
160
160
  verbosityBias,
161
161
  weightedMean,
162
162
  wilcoxonSignedRank
163
- } from "./chunk-R5UQJNKC.js";
163
+ } from "./chunk-4L3WJXQJ.js";
164
164
  import {
165
165
  DEFAULT_REDACTION_RULES,
166
+ DEFAULT_TRACE_ANALYST_BUDGETS,
166
167
  FileSystemTraceStore,
167
168
  InMemoryTraceStore,
168
169
  OTEL_AGENT_EVAL_SCOPE,
170
+ OtlpFileTraceStore,
169
171
  REDACTION_VERSION,
170
172
  ReplayCache,
171
173
  ReplayCacheMissError,
174
+ SpanNotFoundError,
175
+ TRACE_ANALYST_ACTOR_DESCRIPTION,
176
+ TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
177
+ TRACE_ANALYST_SUBAGENT_DESCRIPTION,
178
+ TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
172
179
  TraceFileMissingError,
180
+ TraceNotFoundError,
173
181
  analyzeTraces,
174
182
  buildTraceAnalystTools,
183
+ buildTraceInsightContext,
184
+ buildTraceInsightPrompt,
175
185
  createReplayFetch,
186
+ defaultTraceInsightPanel,
187
+ describeTraceInsightScope,
188
+ domainEvidencePattern,
176
189
  exportRunAsOtlp,
190
+ inferDomainKeywords,
177
191
  iterateRawCalls,
192
+ planTraceInsightQuestions,
178
193
  redactString,
179
- redactValue
180
- } from "./chunk-UW4NOOZI.js";
194
+ redactValue,
195
+ scoreTraceInsightReadiness,
196
+ tokenizeDomainWords,
197
+ traceAnalystFunctionGroup,
198
+ traceAnalystOnRunComplete
199
+ } from "./chunk-HIO4UIS5.js";
181
200
  import {
182
201
  aggregateLlm,
183
202
  argHash,
@@ -201,7 +220,7 @@ import {
201
220
  RunIntegrityError,
202
221
  assertRunCaptured,
203
222
  throwIfRunIncomplete
204
- } from "./chunk-KTGTIOFD.js";
223
+ } from "./chunk-UBPIXOC4.js";
205
224
  import {
206
225
  FileSystemRawProviderSink,
207
226
  InMemoryRawProviderSink,
@@ -229,7 +248,7 @@ import {
229
248
  ReplayError,
230
249
  ValidationError,
231
250
  VerificationError
232
- } from "./chunk-NG236HPC.js";
251
+ } from "./chunk-QYJT52YW.js";
233
252
  import "./chunk-PZ5AY32C.js";
234
253
 
235
254
  // src/run-score.ts
@@ -3237,6 +3256,107 @@ function suggestionForManifest(input) {
3237
3256
  return "No action required.";
3238
3257
  }
3239
3258
 
3259
+ // src/integrity/backend-integrity.ts
3260
+ var BackendIntegrityError = class extends AgentEvalError {
3261
+ constructor(message, report) {
3262
+ super("backend_integrity", message);
3263
+ this.report = report;
3264
+ }
3265
+ report;
3266
+ };
3267
+ function isStubRecord(rec) {
3268
+ return rec.tokenUsage.input === 0 && rec.tokenUsage.output === 0;
3269
+ }
3270
+ function isUncostedRecord(rec) {
3271
+ return rec.tokenUsage.output > 0 && rec.costUsd === 0;
3272
+ }
3273
+ function summarizeBackendIntegrity(records) {
3274
+ const totalRecords = records.length;
3275
+ let stubRecords = 0;
3276
+ let realRecords = 0;
3277
+ let uncostedRecords = 0;
3278
+ let totalInputTokens = 0;
3279
+ let totalOutputTokens = 0;
3280
+ let totalCostUsd = 0;
3281
+ for (const rec of records) {
3282
+ totalInputTokens += rec.tokenUsage.input;
3283
+ totalOutputTokens += rec.tokenUsage.output;
3284
+ totalCostUsd += rec.costUsd;
3285
+ if (isStubRecord(rec)) stubRecords++;
3286
+ else realRecords++;
3287
+ if (isUncostedRecord(rec)) uncostedRecords++;
3288
+ }
3289
+ const verdict = totalRecords === 0 ? "stub" : stubRecords === totalRecords ? "stub" : stubRecords === 0 ? "real" : "mixed";
3290
+ const diagnosis = buildDiagnosis({
3291
+ totalRecords,
3292
+ stubRecords,
3293
+ realRecords,
3294
+ uncostedRecords,
3295
+ totalInputTokens,
3296
+ totalOutputTokens,
3297
+ totalCostUsd,
3298
+ verdict
3299
+ });
3300
+ return {
3301
+ totalRecords,
3302
+ stubRecords,
3303
+ realRecords,
3304
+ uncostedRecords,
3305
+ totalInputTokens,
3306
+ totalOutputTokens,
3307
+ totalCostUsd,
3308
+ verdict,
3309
+ diagnosis
3310
+ };
3311
+ }
3312
+ function buildDiagnosis(r) {
3313
+ if (r.totalRecords === 0) {
3314
+ return "no records \u2014 eval produced zero runs; backend likely failed before first turn";
3315
+ }
3316
+ if (r.verdict === "stub") {
3317
+ return [
3318
+ `all ${r.totalRecords} records have zero token usage \u2014 the LLM backend was never called.`,
3319
+ "common causes: --backend sandbox without a sandbox bridge running; stub model returning hard-coded strings;",
3320
+ "auth misconfigured so requests were silently dropped before the LLM. Re-run with --backend tcloud and TANGLE_API_KEY set,",
3321
+ "or boot the cli-bridge / sandbox before invoking the eval."
3322
+ ].join(" ");
3323
+ }
3324
+ if (r.verdict === "mixed") {
3325
+ const pct = (r.stubRecords / r.totalRecords * 100).toFixed(0);
3326
+ return [
3327
+ `${r.stubRecords}/${r.totalRecords} records (${pct}%) have zero token usage \u2014 the backend partially failed.`,
3328
+ "common causes: rate-limit cascade (429s after the first N personas);",
3329
+ "transient auth expiry mid-run; provider outage. Treat the affected records as missing data, not agent failures."
3330
+ ].join(" ");
3331
+ }
3332
+ if (r.uncostedRecords > 0) {
3333
+ const pct = (r.uncostedRecords / r.totalRecords * 100).toFixed(0);
3334
+ return [
3335
+ `${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens).`,
3336
+ `${r.uncostedRecords} (${pct}%) have output tokens but costUsd=0 \u2014 cost ledger is mis-wired (no input-token`,
3337
+ "propagation from the runtime stream into RunRecord)."
3338
+ ].join(" ");
3339
+ }
3340
+ return `${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens, $${r.totalCostUsd.toFixed(4)}).`;
3341
+ }
3342
+ function assertRealBackend(records, opts = {}) {
3343
+ const report = summarizeBackendIntegrity(records);
3344
+ const allowMixed = opts.allowMixed ?? true;
3345
+ if (report.verdict === "stub") {
3346
+ throw new BackendIntegrityError(
3347
+ `backend-integrity: ran against a stub or unconfigured backend \u2014 ${report.diagnosis}`,
3348
+ report
3349
+ );
3350
+ }
3351
+ if (!allowMixed && report.verdict === "mixed") {
3352
+ throw new BackendIntegrityError(
3353
+ `backend-integrity: partial backend failure rejected \u2014 ${report.diagnosis}`,
3354
+ report
3355
+ );
3356
+ }
3357
+ return report;
3358
+ }
3359
+
3240
3360
  // src/judges.ts
3241
3361
  function createDomainExpertJudge(domain) {
3242
3362
  return async (tc, { scenario, turns }) => {
@@ -10299,6 +10419,7 @@ export {
10299
10419
  AnalystRegistry,
10300
10420
  AxGepaSteeringOptimizer,
10301
10421
  BENCHMARK_SPLIT_SEED,
10422
+ BackendIntegrityError,
10302
10423
  BenchmarkRunner,
10303
10424
  BudgetBreachError,
10304
10425
  BudgetGuard,
@@ -10320,6 +10441,7 @@ export {
10320
10441
  DEFAULT_RED_TEAM_CORPUS,
10321
10442
  DEFAULT_RUN_SCORE_WEIGHTS,
10322
10443
  DEFAULT_SEVERITY_WEIGHTS,
10444
+ DEFAULT_TRACE_ANALYST_BUDGETS,
10323
10445
  DEFAULT_TRACE_ANALYST_KINDS,
10324
10446
  Dataset,
10325
10447
  DockerSandboxDriver,
@@ -10367,6 +10489,7 @@ export {
10367
10489
  NoopResearcher,
10368
10490
  NotFoundError,
10369
10491
  OTEL_AGENT_EVAL_SCOPE,
10492
+ OtlpFileTraceStore,
10370
10493
  PairwiseSteeringOptimizer,
10371
10494
  ProductClient,
10372
10495
  PromptRegistry,
@@ -10383,10 +10506,17 @@ export {
10383
10506
  SEMANTIC_CONCEPT_JUDGE_VERSION,
10384
10507
  SandboxHarness,
10385
10508
  ScenarioRegistry,
10509
+ SpanNotFoundError,
10386
10510
  SubprocessSandboxDriver,
10511
+ TRACE_ANALYST_ACTOR_DESCRIPTION,
10512
+ TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION,
10513
+ TRACE_ANALYST_SUBAGENT_DESCRIPTION,
10514
+ TRACE_ANALYST_TRUNCATION_MARKER_PREFIX,
10387
10515
  TRACE_SCHEMA_VERSION,
10388
10516
  TokenCounter,
10389
10517
  TraceEmitter,
10518
+ TraceFileMissingError,
10519
+ TraceNotFoundError,
10390
10520
  TrialTelemetry,
10391
10521
  UNIVERSAL_FINDERS,
10392
10522
  ValidationError,
@@ -10399,8 +10529,10 @@ export {
10399
10529
  allCriticalPassed,
10400
10530
  analyzeAntiSlop,
10401
10531
  analyzeSeries,
10532
+ analyzeTraces,
10402
10533
  argHash,
10403
10534
  assertLlmRoute,
10535
+ assertRealBackend,
10404
10536
  assertReleaseConfidence,
10405
10537
  assertRunCaptured,
10406
10538
  assignFeedbackSplit,
@@ -10415,6 +10547,9 @@ export {
10415
10547
  bootstrapCi,
10416
10548
  buildReflectionPrompt,
10417
10549
  buildReviewerPrompt,
10550
+ buildTraceAnalystTools,
10551
+ buildTraceInsightContext,
10552
+ buildTraceInsightPrompt,
10418
10553
  buildTraceToolsForGroup,
10419
10554
  buildTrajectory,
10420
10555
  byteLengthRange,
@@ -10479,10 +10614,13 @@ export {
10479
10614
  defaultMultiShotObjectives,
10480
10615
  defaultProviderRedactor,
10481
10616
  defaultReferenceReplayMatcher,
10617
+ defaultTraceInsightPanel,
10482
10618
  deployGateLayer,
10619
+ describeTraceInsightScope,
10483
10620
  diffFindings,
10484
10621
  discoverPersonas,
10485
10622
  distillPlaybook,
10623
+ domainEvidencePattern,
10486
10624
  dominates,
10487
10625
  estimateCost,
10488
10626
  estimateTokens,
@@ -10526,6 +10664,7 @@ export {
10526
10664
  httpGithubClient,
10527
10665
  inMemoryReferenceReplayStore,
10528
10666
  inMemoryReviewStore,
10667
+ inferDomainKeywords,
10529
10668
  integrationAsi,
10530
10669
  integrationGateEvals,
10531
10670
  integrationInvokeFailedPayload,
@@ -10583,6 +10722,7 @@ export {
10583
10722
  partialCredit,
10584
10723
  passOrthogonality,
10585
10724
  pixelDeltaRatio,
10725
+ planTraceInsightQuestions,
10586
10726
  politenessPrefixMutator,
10587
10727
  positionalBias,
10588
10728
  printDriverSummary,
@@ -10651,6 +10791,7 @@ export {
10651
10791
  scoreKnowledgeReadiness,
10652
10792
  scoreRedTeamOutput,
10653
10793
  scoreReferenceReplay,
10794
+ scoreTraceInsightReadiness,
10654
10795
  securityJudge,
10655
10796
  selectHarnessVariant,
10656
10797
  selfPreference,
@@ -10664,6 +10805,7 @@ export {
10664
10805
  stripFencedJson,
10665
10806
  subjectiveEval,
10666
10807
  summarize,
10808
+ summarizeBackendIntegrity,
10667
10809
  summarizeHarnessResults,
10668
10810
  summarizePreferenceMemory,
10669
10811
  summaryTable,
@@ -10672,8 +10814,11 @@ export {
10672
10814
  throwIfRunIncomplete,
10673
10815
  toLangfuseEnvelope,
10674
10816
  toPrometheusText,
10817
+ tokenizeDomainWords,
10675
10818
  toolNamesForRun,
10676
10819
  toolSpans,
10820
+ traceAnalystFunctionGroup,
10821
+ traceAnalystOnRunComplete,
10677
10822
  trialTraceFromMultiShotTrial,
10678
10823
  typoMutator,
10679
10824
  urlContains,