@tangle-network/agent-eval 0.36.0 → 0.38.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -195,6 +195,8 @@ import {
195
195
  buildTraceAnalystTools,
196
196
  buildTraceInsightContext,
197
197
  buildTraceInsightPrompt,
198
+ createOtelExporter,
199
+ createOtelTracingStore,
198
200
  createReplayFetch,
199
201
  defaultTraceInsightPanel,
200
202
  describeTraceInsightScope,
@@ -202,6 +204,7 @@ import {
202
204
  exportRunAsOtlp,
203
205
  inferDomainKeywords,
204
206
  iterateRawCalls,
207
+ otelRunCompleteHook,
205
208
  planTraceInsightQuestions,
206
209
  redactString,
207
210
  redactValue,
@@ -209,7 +212,7 @@ import {
209
212
  tokenizeDomainWords,
210
213
  traceAnalystFunctionGroup,
211
214
  traceAnalystOnRunComplete
212
- } from "./chunk-HIO4UIS5.js";
215
+ } from "./chunk-L5UNCDAJ.js";
213
216
  import {
214
217
  aggregateLlm,
215
218
  argHash,
@@ -11031,6 +11034,197 @@ function aggregateTrialsByMode(trials, opts) {
11031
11034
  metrics: meanMetrics(counted.map((t) => t.metrics ?? {}))
11032
11035
  };
11033
11036
  }
11037
+
11038
+ // src/otel-pipeline.ts
11039
/**
 * Create an OTel exporter whose resource attributes carry pipeline metadata.
 *
 * `pipeline.kind` and `pipeline.id` are added only when the matching option is
 * truthy. Attributes from `opts.otelConfig.resourceAttributes` are spread last,
 * so they win when a key collides with the pipeline tags.
 *
 * @param {object} [opts] - Optional settings.
 * @param {string} [opts.pipelineKind] - Value for the `pipeline.kind` attribute.
 * @param {string} [opts.pipelineId] - Value for the `pipeline.id` attribute.
 * @param {object} [opts.otelConfig] - Config forwarded to `createOtelExporter`.
 * @returns {{ exporter: *, shutdown: () => Promise<void> }} The exporter (whatever
 *   `createOtelExporter` returned) and a shutdown helper that is a no-op when the
 *   exporter is falsy.
 */
function withOtelPipeline(opts) {
  const pipelineTags = {};
  if (opts?.pipelineKind) {
    pipelineTags["pipeline.kind"] = opts.pipelineKind;
  }
  if (opts?.pipelineId) {
    pipelineTags["pipeline.id"] = opts.pipelineId;
  }

  // Caller-supplied attributes go last so they override the pipeline tags.
  const resourceAttributes = {
    ...pipelineTags,
    ...opts?.otelConfig?.resourceAttributes
  };
  const exporter = createOtelExporter({ ...opts?.otelConfig, resourceAttributes });

  const shutdown = async () => {
    if (exporter) {
      await exporter.shutdown();
    }
  };

  return { exporter, shutdown };
}
11056
/**
 * Report whether an OTLP endpoint is configured through the environment.
 *
 * @returns {boolean} `true` when a `process` global exists and
 *   `process.env.OTEL_EXPORTER_OTLP_ENDPOINT` is a non-empty value; otherwise `false`.
 */
function isOtelConfigured() {
  // Runtimes without a `process` global cannot supply the variable.
  if (typeof process === "undefined") {
    return false;
  }
  const endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
  return Boolean(endpoint);
}
11059
+
11060
+ // src/traced-judges.ts
11061
/**
 * Wrap a judge function so that every invocation is recorded as an "llm" span.
 *
 * The span is named `judge:<judgeName>`. On success it is ended with the mean of
 * the returned `score` values (0 when no scores are returned) and the number of
 * scores. When anything inside the guarded section throws, the span is failed
 * and the original error is rethrown.
 *
 * @param {Function} judge - Async judge called as `judge(tc, input)`; expected to
 *   resolve to an array of objects with a numeric `score`.
 * @param {string} judgeName - Name used in the span name and `judge.name` attribute.
 * @param {{ emitter: object, parentSpanId?: string }} opts - Span emitter and
 *   optional parent span id.
 * @returns {(tc: *, input: *) => Promise<Array>} Traced judge with the same call shape.
 */
function traceJudge(judge, judgeName, opts) {
  const spanName = `judge:${judgeName}`;

  return async (tc, input) => {
    const handle = await opts.emitter.span({
      kind: "llm",
      name: spanName,
      parentSpanId: opts.parentSpanId,
      attributes: {
        "judge.name": judgeName,
        "eval.phase": "judge"
      }
    });

    try {
      const scores = await judge(tc, input);

      // Mean score across dimensions; an empty result counts as 0.
      let composite = 0;
      if (scores.length > 0) {
        let total = 0;
        for (const { score } of scores) {
          total += score;
        }
        composite = total / scores.length;
      }

      await handle.end({
        attributes: {
          "judge.name": judgeName,
          "judge.composite_score": composite,
          "judge.dimension_count": scores.length,
          "eval.phase": "judge"
        }
      });
      return scores;
    } catch (err) {
      // Non-Error throwables are stringified before being handed to the span.
      const reason = err instanceof Error ? err : String(err);
      await handle.fail(reason);
      throw err;
    }
  };
}
11090
/**
 * Wrap a list of judges so they run under one "judge:ensemble" span.
 *
 * Judges run one after another, in array order. Each is wrapped with
 * `traceJudge` using the ensemble span's id as parent, and all returned scores
 * are concatenated. The ensemble span is ended with the mean `score` across
 * every collected entry (0 when there are none). If any judge or span call
 * inside the guarded section throws, the ensemble span is failed and the
 * original error is rethrown.
 *
 * @param {Function[]} judges - Judge functions called as `judge(tc, input)`.
 * @param {string[]} [judgeNames] - Names aligned by index with `judges`. A missing
 *   entry, or a missing array, falls back to `judge_<index>`.
 * @param {{ emitter: object, parentSpanId?: string }} opts - Span emitter and
 *   optional parent span id for the ensemble span.
 * @returns {(tc: *, input: *) => Promise<Array>} Traced ensemble resolving to the
 *   concatenated scores of all judges.
 */
function traceJudgeEnsemble(judges, judgeNames, opts) {
  return async (tc, input) => {
    const ensembleSpan = await opts.emitter.span({
      kind: "custom",
      name: "judge:ensemble",
      parentSpanId: opts.parentSpanId,
      attributes: {
        "judge.ensemble_size": judges.length,
        "eval.phase": "judge"
      }
    });
    try {
      const allScores = [];
      for (let i = 0; i < judges.length; i++) {
        // Optional chaining lets an omitted names array use the same per-index
        // fallback as a missing entry, instead of throwing a TypeError.
        const name = judgeNames?.[i] ?? `judge_${i}`;
        const tracedFn = traceJudge(judges[i], name, {
          emitter: opts.emitter,
          parentSpanId: ensembleSpan.span.spanId
        });
        const scores = await tracedFn(tc, input);
        allScores.push(...scores);
      }
      const composite =
        allScores.length > 0
          ? allScores.reduce((sum, s) => sum + s.score, 0) / allScores.length
          : 0;
      await ensembleSpan.end({
        attributes: {
          "judge.ensemble_size": judges.length,
          "judge.composite_score": composite,
          "judge.total_dimensions": allScores.length,
          "eval.phase": "judge"
        }
      });
      return allScores;
    } catch (err) {
      // Non-Error throwables are stringified before being handed to the span.
      await ensembleSpan.fail(err instanceof Error ? err : String(err));
      throw err;
    }
  };
}
11129
+
11130
+ // src/traced-analyst.ts
11131
/**
 * Run `analyzeTraces` under an "analyst:analyze-traces" span, with one child
 * span per analyst turn.
 *
 * The caller's `onTurn` callback (if any) is replaced by a wrapper that opens an
 * `analyst:turn-<n>` span, closes it at once (failed when `turn.isError` is
 * truthy, ended otherwise), and then awaits the original callback. The parent
 * span is ended with result statistics on success. If `analyzeTraces` or ending
 * the parent span throws, the parent span is failed and the error is rethrown.
 *
 * @param {{ question: string }} input - Analyst input; only `question.length` is
 *   read here, the whole object is forwarded to `analyzeTraces`.
 * @param {object} [options] - Options forwarded to `analyzeTraces`. `maxTurns` and
 *   `maxDepth` are reported as span attributes (12 and 1 are reported when unset).
 *   A nullish value is treated as `{}`.
 * @param {{ emitter: object, parentSpanId?: string }} traceOpts - Span emitter and
 *   optional parent span id.
 * @returns {Promise<object>} Whatever `analyzeTraces` resolves to.
 */
async function tracedAnalyzeTraces(input, options, traceOpts) {
  // Normalize so an omitted options object does not throw before tracing starts.
  const analystOptions = options ?? {};
  const parentSpan = await traceOpts.emitter.span({
    kind: "custom",
    name: "analyst:analyze-traces",
    parentSpanId: traceOpts.parentSpanId,
    attributes: {
      "analyst.question_length": input.question.length,
      "analyst.max_turns": analystOptions.maxTurns ?? 12,
      "analyst.max_depth": analystOptions.maxDepth ?? 1,
      "eval.phase": "analyst"
    }
  });
  const originalOnTurn = analystOptions.onTurn;
  const wrappedOptions = {
    ...analystOptions,
    onTurn: async (turn) => {
      const turnSpan = await traceOpts.emitter.span({
        kind: "custom",
        name: `analyst:turn-${turn.turn}`,
        parentSpanId: parentSpan.span.spanId,
        attributes: {
          "analyst.turn": turn.turn,
          "analyst.is_error": turn.isError,
          "analyst.code_length": turn.code.length,
          "analyst.output_length": turn.output.length,
          "eval.phase": "analyst"
        }
      });
      // The turn has already finished when this callback fires, so the span is
      // closed right away and only records the turn's outcome.
      if (turn.isError) {
        await turnSpan.fail("Turn produced an error");
      } else {
        await turnSpan.end();
      }
      if (originalOnTurn) await originalOnTurn(turn);
    }
  };
  try {
    const result = await analyzeTraces(input, wrappedOptions);
    await parentSpan.end({
      attributes: {
        "analyst.question_length": input.question.length,
        "analyst.turn_count": result.turnCount,
        "analyst.finding_count": result.findings.length,
        "analyst.answer_length": result.answer.length,
        "eval.phase": "analyst"
      }
    });
    return result;
  } catch (err) {
    // Non-Error throwables are stringified before being handed to the span.
    await parentSpan.fail(err instanceof Error ? err : String(err));
    throw err;
  }
}
11186
+
11187
+ // src/traced-mutator.ts
11188
/**
 * Wrap a mutator adapter so each `mutate` call is recorded as an "llm" span.
 *
 * The span is named `mutator:gen-<generation>` and is opened with attributes
 * describing the request. On success it is ended with the same request
 * attributes plus the number of children produced and their ids joined by
 * commas. When the adapter or the span end throws, the span is failed and the
 * original error is rethrown.
 *
 * @param {{ mutate: Function }} adapter - Underlying adapter; `mutate(args)` is
 *   expected to resolve to an array of objects with an `id`.
 * @param {{ emitter: object, parentSpanId?: string }} opts - Span emitter and
 *   optional parent span id.
 * @returns {{ mutate: (args: object) => Promise<Array> }} Traced adapter.
 */
function traceMutator(adapter, opts) {
  // Built on demand (not cached) so the values are read from `args` at the
  // moment each attribute set is produced.
  const requestAttributes = (args) => ({
    "mutator.parent_id": args.parent.id,
    "mutator.generation": args.generation,
    "mutator.child_count": args.childCount,
    "mutator.top_trials": args.topTrials.length,
    "mutator.bottom_trials": args.bottomTrials.length,
    "mutator.parent_score": args.parentAggregate.meanScore
  });

  const mutate = async (args) => {
    const handle = await opts.emitter.span({
      kind: "llm",
      name: `mutator:gen-${args.generation}`,
      parentSpanId: opts.parentSpanId,
      attributes: {
        ...requestAttributes(args),
        "eval.phase": "mutator"
      }
    });

    let children;
    try {
      children = await adapter.mutate(args);
      const childIds = children.map((child) => child.id).join(",");
      await handle.end({
        attributes: {
          ...requestAttributes(args),
          "mutator.produced_count": children.length,
          "mutator.child_ids": childIds,
          "eval.phase": "mutator"
        }
      });
    } catch (err) {
      // Non-Error throwables are stringified before being handed to the span.
      const reason = err instanceof Error ? err : String(err);
      await handle.fail(reason);
      throw err;
    }
    return children;
  };

  return { mutate };
}
11034
11228
  export {
11035
11229
  AGENT_PROFILE_KINDS,
11036
11230
  ANALYST_SEVERITIES,
@@ -11229,6 +11423,8 @@ export {
11229
11423
  createJudgeAdapter,
11230
11424
  createLlmCorrectnessChecker,
11231
11425
  createLlmReviewer,
11426
+ createOtelExporter,
11427
+ createOtelTracingStore,
11232
11428
  createReplayFetch,
11233
11429
  createRunCriticAdapter,
11234
11430
  createSandboxCodeMutator,
@@ -11312,6 +11508,7 @@ export {
11312
11508
  iqr,
11313
11509
  isJudgeSpan,
11314
11510
  isLlmSpan,
11511
+ isOtelConfigured,
11315
11512
  isRetrievalSpan,
11316
11513
  isRunRecord,
11317
11514
  isSandboxSpan,
@@ -11345,6 +11542,7 @@ export {
11345
11542
  normalizeScores,
11346
11543
  notBlocked,
11347
11544
  objectiveEval,
11545
+ otelRunCompleteHook,
11348
11546
  pairedBootstrap,
11349
11547
  pairedEvalueSequence,
11350
11548
  pairedMde,
@@ -11467,6 +11665,10 @@ export {
11467
11665
  toolSpans,
11468
11666
  traceAnalystFunctionGroup,
11469
11667
  traceAnalystOnRunComplete,
11668
+ traceJudge,
11669
+ traceJudgeEnsemble,
11670
+ traceMutator,
11671
+ tracedAnalyzeTraces,
11470
11672
  trialTraceFromMultiShotTrial,
11471
11673
  typoMutator,
11472
11674
  urlContains,
@@ -11487,6 +11689,7 @@ export {
11487
11689
  wilcoxonSignedRank,
11488
11690
  withAssignedFeedbackSplit,
11489
11691
  withJudgeRetry,
11692
+ withOtelPipeline,
11490
11693
  wranglerDeployRunner
11491
11694
  };
11492
11695
  //# sourceMappingURL=index.js.map