@tangle-network/agent-eval 0.36.0 → 0.38.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-HIO4UIS5.js → chunk-L5UNCDAJ.js} +207 -1
- package/dist/chunk-L5UNCDAJ.js.map +1 -0
- package/dist/chunk-QWV226SL.js +276 -0
- package/dist/chunk-QWV226SL.js.map +1 -0
- package/dist/index.d.ts +111 -3
- package/dist/index.js +204 -1
- package/dist/index.js.map +1 -1
- package/dist/matrix/index.d.ts +2 -109
- package/dist/matrix/index.js +5 -270
- package/dist/matrix/index.js.map +1 -1
- package/dist/multishot/index.d.ts +276 -0
- package/dist/multishot/index.js +467 -0
- package/dist/multishot/index.js.map +1 -0
- package/dist/openapi.json +1 -1
- package/dist/traces.d.ts +81 -2
- package/dist/traces.js +7 -1
- package/dist/types-DHqkLwEU.d.ts +110 -0
- package/package.json +6 -1
- package/dist/chunk-HIO4UIS5.js.map +0 -1
package/dist/index.js
CHANGED
|
@@ -195,6 +195,8 @@ import {
|
|
|
195
195
|
buildTraceAnalystTools,
|
|
196
196
|
buildTraceInsightContext,
|
|
197
197
|
buildTraceInsightPrompt,
|
|
198
|
+
createOtelExporter,
|
|
199
|
+
createOtelTracingStore,
|
|
198
200
|
createReplayFetch,
|
|
199
201
|
defaultTraceInsightPanel,
|
|
200
202
|
describeTraceInsightScope,
|
|
@@ -202,6 +204,7 @@ import {
|
|
|
202
204
|
exportRunAsOtlp,
|
|
203
205
|
inferDomainKeywords,
|
|
204
206
|
iterateRawCalls,
|
|
207
|
+
otelRunCompleteHook,
|
|
205
208
|
planTraceInsightQuestions,
|
|
206
209
|
redactString,
|
|
207
210
|
redactValue,
|
|
@@ -209,7 +212,7 @@ import {
|
|
|
209
212
|
tokenizeDomainWords,
|
|
210
213
|
traceAnalystFunctionGroup,
|
|
211
214
|
traceAnalystOnRunComplete
|
|
212
|
-
} from "./chunk-HIO4UIS5.js";
|
|
215
|
+
} from "./chunk-L5UNCDAJ.js";
|
|
213
216
|
import {
|
|
214
217
|
aggregateLlm,
|
|
215
218
|
argHash,
|
|
@@ -11031,6 +11034,197 @@ function aggregateTrialsByMode(trials, opts) {
|
|
|
11031
11034
|
metrics: meanMetrics(counted.map((t) => t.metrics ?? {}))
|
|
11032
11035
|
};
|
|
11033
11036
|
}
|
|
11037
|
+
|
|
11038
|
+
// src/otel-pipeline.ts
|
|
11039
|
+
/**
 * Creates an OTel exporter whose resource attributes carry the pipeline tags.
 *
 * Resource attributes are merged in this order (later entries win):
 * `pipeline.kind`, `pipeline.id`, then `opts.otelConfig.resourceAttributes`.
 * The pipeline tags are only added when the corresponding option is truthy.
 *
 * @param {object} [opts]
 * @param {string} [opts.pipelineKind] - Value for the `pipeline.kind` attribute.
 * @param {string} [opts.pipelineId] - Value for the `pipeline.id` attribute.
 * @param {object} [opts.otelConfig] - Passed through to `createOtelExporter`.
 * @returns {{ exporter: *, shutdown: () => Promise<void> }} The exporter as
 *   returned by `createOtelExporter`, plus a `shutdown` that is a no-op when
 *   the exporter is falsy.
 */
function withOtelPipeline(opts) {
  const pipelineTags = {};
  if (opts?.pipelineKind) {
    pipelineTags["pipeline.kind"] = opts.pipelineKind;
  }
  if (opts?.pipelineId) {
    pipelineTags["pipeline.id"] = opts.pipelineId;
  }

  const resourceAttributes = {
    ...pipelineTags,
    ...opts?.otelConfig?.resourceAttributes
  };
  const exporter = createOtelExporter({
    ...opts?.otelConfig,
    resourceAttributes
  });

  return {
    exporter,
    async shutdown() {
      if (!exporter) {
        return;
      }
      await exporter.shutdown();
    }
  };
}
|
|
11056
|
+
/**
 * Reports whether an OTLP endpoint is configured through the environment.
 *
 * @returns {boolean} `true` when `process.env.OTEL_EXPORTER_OTLP_ENDPOINT`
 *   holds a truthy value; `false` when it is unset or empty, or when
 *   `process` or `process.env` is unavailable.
 */
function isOtelConfigured() {
  if (typeof process === "undefined") {
    return false;
  }
  // A `process` shim may exist without an `env` object; treat that as
  // "not configured" instead of throwing a TypeError.
  return Boolean(process.env?.OTEL_EXPORTER_OTLP_ENDPOINT);
}
|
|
11059
|
+
|
|
11060
|
+
// src/traced-judges.ts
|
|
11061
|
+
/**
 * Wraps a judge so that every invocation is recorded as an `llm` span named
 * `judge:<judgeName>`.
 *
 * On success the span is ended with the composite score (arithmetic mean of
 * `score` over the returned entries, `0` for an empty result) and the number
 * of entries. On failure the span is marked failed and the error is rethrown.
 *
 * @param {Function} judge - Called as `judge(tc, input)`; expected to resolve
 *   to an array of entries that each carry a numeric `score`.
 * @param {string} judgeName - Used in the span name and `judge.name` attribute.
 * @param {{ emitter: { span: Function }, parentSpanId?: string }} opts
 * @returns {(tc: *, input: *) => Promise<Array>} Traced judge with the same
 *   call shape; resolves to exactly what `judge` resolved to.
 * @throws Rethrows whatever `judge` (or `span.end`) threw.
 */
function traceJudge(judge, judgeName, opts) {
  return async (tc, input) => {
    const span = await opts.emitter.span({
      kind: "llm",
      name: `judge:${judgeName}`,
      parentSpanId: opts.parentSpanId,
      attributes: {
        "judge.name": judgeName,
        "eval.phase": "judge"
      }
    });
    try {
      const scores = await judge(tc, input);
      const total = scores.reduce((sum, s) => sum + s.score, 0);
      const composite = scores.length > 0 ? total / scores.length : 0;
      await span.end({
        attributes: {
          "judge.name": judgeName,
          "judge.composite_score": composite,
          "judge.dimension_count": scores.length,
          "eval.phase": "judge"
        }
      });
      return scores;
    } catch (err) {
      try {
        await span.fail(err instanceof Error ? err : String(err));
      } catch {
        // Deliberately ignored: a failure while recording the failure must
        // not replace the original error the caller needs to see.
      }
      throw err;
    }
  };
}
|
|
11090
|
+
/**
 * Wraps a list of judges so they run under a single `judge:ensemble` span,
 * each judge traced as a child of that span via `traceJudge`.
 *
 * Judges are awaited one at a time, in array order, and their results are
 * concatenated. The ensemble span is ended with the mean `score` over all
 * concatenated entries (`0` when there are none).
 *
 * @param {Function[]} judges - Each called as `judge(tc, input)`.
 * @param {string[]} [judgeNames] - Name per judge by index; a missing entry
 *   (or a missing array) falls back to `judge_<index>`.
 * @param {{ emitter: { span: Function }, parentSpanId?: string }} opts
 * @returns {(tc: *, input: *) => Promise<Array>} Resolves to the concatenated
 *   score entries of every judge.
 * @throws Rethrows the first error raised by a judge (or by span handling on
 *   the success path); later judges are not run.
 */
function traceJudgeEnsemble(judges, judgeNames, opts) {
  return async (tc, input) => {
    const ensembleSpan = await opts.emitter.span({
      kind: "custom",
      name: "judge:ensemble",
      parentSpanId: opts.parentSpanId,
      attributes: {
        "judge.ensemble_size": judges.length,
        "eval.phase": "judge"
      }
    });
    try {
      const allScores = [];
      for (let i = 0; i < judges.length; i++) {
        // `?.` lets callers omit the names array and rely on the fallback.
        const name = judgeNames?.[i] ?? `judge_${i}`;
        const tracedFn = traceJudge(judges[i], name, {
          emitter: opts.emitter,
          parentSpanId: ensembleSpan.span.spanId
        });
        const scores = await tracedFn(tc, input);
        allScores.push(...scores);
      }
      const total = allScores.reduce((sum, s) => sum + s.score, 0);
      const composite = allScores.length > 0 ? total / allScores.length : 0;
      await ensembleSpan.end({
        attributes: {
          "judge.ensemble_size": judges.length,
          "judge.composite_score": composite,
          "judge.total_dimensions": allScores.length,
          "eval.phase": "judge"
        }
      });
      return allScores;
    } catch (err) {
      try {
        await ensembleSpan.fail(err instanceof Error ? err : String(err));
      } catch {
        // Deliberately ignored: a failure while recording the failure must
        // not replace the original error the caller needs to see.
      }
      throw err;
    }
  };
}
|
|
11129
|
+
|
|
11130
|
+
// src/traced-analyst.ts
|
|
11131
|
+
/**
 * Runs `analyzeTraces` under an `analyst:analyze-traces` span and records one
 * `analyst:turn-<n>` child span per reported turn.
 *
 * Each turn span is opened and closed inside the `onTurn` callback (failed
 * when `turn.isError` is truthy, ended otherwise), after which any `onTurn`
 * supplied by the caller is invoked with the same turn.
 *
 * @param {{ question: string }} input - Forwarded to `analyzeTraces`.
 * @param {object} options - Forwarded to `analyzeTraces` with `onTurn`
 *   wrapped. `maxTurns` / `maxDepth` are recorded on the span, shown as 12 / 1
 *   when unset (presumably mirroring the analyst's defaults — TODO confirm).
 * @param {{ emitter: { span: Function }, parentSpanId?: string }} traceOpts
 * @returns {Promise<object>} The result of `analyzeTraces`, unchanged.
 * @throws Rethrows whatever `analyzeTraces` (or `parentSpan.end`) threw.
 */
async function tracedAnalyzeTraces(input, options, traceOpts) {
  const parentSpan = await traceOpts.emitter.span({
    kind: "custom",
    name: "analyst:analyze-traces",
    parentSpanId: traceOpts.parentSpanId,
    attributes: {
      "analyst.question_length": input.question.length,
      "analyst.max_turns": options.maxTurns ?? 12,
      "analyst.max_depth": options.maxDepth ?? 1,
      "eval.phase": "analyst"
    }
  });
  const originalOnTurn = options.onTurn;
  const wrappedOptions = {
    ...options,
    onTurn: async (turn) => {
      const turnSpan = await traceOpts.emitter.span({
        kind: "custom",
        name: `analyst:turn-${turn.turn}`,
        parentSpanId: parentSpan.span.spanId,
        attributes: {
          "analyst.turn": turn.turn,
          "analyst.is_error": turn.isError,
          "analyst.code_length": turn.code.length,
          "analyst.output_length": turn.output.length,
          "eval.phase": "analyst"
        }
      });
      if (turn.isError) {
        await turnSpan.fail("Turn produced an error");
      } else {
        await turnSpan.end();
      }
      if (originalOnTurn) await originalOnTurn(turn);
    }
  };
  try {
    const result = await analyzeTraces(input, wrappedOptions);
    await parentSpan.end({
      attributes: {
        "analyst.question_length": input.question.length,
        "analyst.turn_count": result.turnCount,
        "analyst.finding_count": result.findings.length,
        "analyst.answer_length": result.answer.length,
        "eval.phase": "analyst"
      }
    });
    return result;
  } catch (err) {
    try {
      await parentSpan.fail(err instanceof Error ? err : String(err));
    } catch {
      // Deliberately ignored: a failure while recording the failure must
      // not replace the original error the caller needs to see.
    }
    throw err;
  }
}
|
|
11186
|
+
|
|
11187
|
+
// src/traced-mutator.ts
|
|
11188
|
+
/**
 * Wraps a mutator adapter so each `mutate` call is recorded as an `llm` span
 * named `mutator:gen-<generation>`.
 *
 * The returned object exposes only `mutate`; it forwards `args` untouched to
 * `adapter.mutate` and resolves to the same children. On success the span is
 * ended with the input attributes plus the produced count and a
 * comma-separated list of child ids.
 *
 * @param {{ mutate: Function }} adapter - The mutator being traced.
 * @param {{ emitter: { span: Function }, parentSpanId?: string }} opts
 * @returns {{ mutate: (args: object) => Promise<Array> }}
 * @throws `mutate` rethrows whatever `adapter.mutate` (or `span.end`) threw.
 */
function traceMutator(adapter, opts) {
  return {
    async mutate(args) {
      // Captured once so the start and end attributes always agree, even if
      // the adapter mutates `args` while it runs.
      const inputAttributes = {
        "mutator.parent_id": args.parent.id,
        "mutator.generation": args.generation,
        "mutator.child_count": args.childCount,
        "mutator.top_trials": args.topTrials.length,
        "mutator.bottom_trials": args.bottomTrials.length,
        "mutator.parent_score": args.parentAggregate.meanScore
      };
      const span = await opts.emitter.span({
        kind: "llm",
        name: `mutator:gen-${args.generation}`,
        parentSpanId: opts.parentSpanId,
        attributes: {
          ...inputAttributes,
          "eval.phase": "mutator"
        }
      });
      try {
        const children = await adapter.mutate(args);
        await span.end({
          attributes: {
            ...inputAttributes,
            "mutator.produced_count": children.length,
            "mutator.child_ids": children.map((c) => c.id).join(","),
            "eval.phase": "mutator"
          }
        });
        return children;
      } catch (err) {
        try {
          await span.fail(err instanceof Error ? err : String(err));
        } catch {
          // Deliberately ignored: a failure while recording the failure must
          // not replace the original error the caller needs to see.
        }
        throw err;
      }
    }
  };
}
|
|
11034
11228
|
export {
|
|
11035
11229
|
AGENT_PROFILE_KINDS,
|
|
11036
11230
|
ANALYST_SEVERITIES,
|
|
@@ -11229,6 +11423,8 @@ export {
|
|
|
11229
11423
|
createJudgeAdapter,
|
|
11230
11424
|
createLlmCorrectnessChecker,
|
|
11231
11425
|
createLlmReviewer,
|
|
11426
|
+
createOtelExporter,
|
|
11427
|
+
createOtelTracingStore,
|
|
11232
11428
|
createReplayFetch,
|
|
11233
11429
|
createRunCriticAdapter,
|
|
11234
11430
|
createSandboxCodeMutator,
|
|
@@ -11312,6 +11508,7 @@ export {
|
|
|
11312
11508
|
iqr,
|
|
11313
11509
|
isJudgeSpan,
|
|
11314
11510
|
isLlmSpan,
|
|
11511
|
+
isOtelConfigured,
|
|
11315
11512
|
isRetrievalSpan,
|
|
11316
11513
|
isRunRecord,
|
|
11317
11514
|
isSandboxSpan,
|
|
@@ -11345,6 +11542,7 @@ export {
|
|
|
11345
11542
|
normalizeScores,
|
|
11346
11543
|
notBlocked,
|
|
11347
11544
|
objectiveEval,
|
|
11545
|
+
otelRunCompleteHook,
|
|
11348
11546
|
pairedBootstrap,
|
|
11349
11547
|
pairedEvalueSequence,
|
|
11350
11548
|
pairedMde,
|
|
@@ -11467,6 +11665,10 @@ export {
|
|
|
11467
11665
|
toolSpans,
|
|
11468
11666
|
traceAnalystFunctionGroup,
|
|
11469
11667
|
traceAnalystOnRunComplete,
|
|
11668
|
+
traceJudge,
|
|
11669
|
+
traceJudgeEnsemble,
|
|
11670
|
+
traceMutator,
|
|
11671
|
+
tracedAnalyzeTraces,
|
|
11470
11672
|
trialTraceFromMultiShotTrial,
|
|
11471
11673
|
typoMutator,
|
|
11472
11674
|
urlContains,
|
|
@@ -11487,6 +11689,7 @@ export {
|
|
|
11487
11689
|
wilcoxonSignedRank,
|
|
11488
11690
|
withAssignedFeedbackSplit,
|
|
11489
11691
|
withJudgeRetry,
|
|
11692
|
+
withOtelPipeline,
|
|
11490
11693
|
wranglerDeployRunner
|
|
11491
11694
|
};
|
|
11492
11695
|
//# sourceMappingURL=index.js.map
|