@tangle-network/agent-eval 0.20.8 → 0.20.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +302 -0
- package/LICENSE +21 -0
- package/README.md +16 -9
- package/dist/benchmarks/index.d.ts +1 -0
- package/dist/benchmarks/index.js +12 -0
- package/dist/benchmarks/index.js.map +1 -0
- package/dist/chunk-42I2QC2L.js +219 -0
- package/dist/chunk-42I2QC2L.js.map +1 -0
- package/dist/{chunk-CJJSB6ZQ.js → chunk-LSR4IAYN.js} +90 -11
- package/dist/chunk-LSR4IAYN.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/index-1PZOtZFr.d.ts +290 -0
- package/dist/index.d.ts +37 -298
- package/dist/index.js +130 -252
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +502 -0
- package/dist/{sink-fetch-C0B8ximv.d.ts → sink-fetch-B1Yg4Til.d.ts} +1 -1
- package/dist/telemetry/file.d.ts +1 -1
- package/dist/telemetry/index.d.ts +2 -2
- package/dist/telemetry/index.js.map +1 -1
- package/dist/wire/index.js +1 -1
- package/docs/concepts.md +4 -4
- package/docs/knowledge-readiness.md +2 -2
- package/docs/wire-protocol.md +3 -3
- package/package.json +13 -5
- package/dist/chunk-CJJSB6ZQ.js.map +0 -1
- package/examples/benchmarks/README.md +0 -44
- package/examples/benchmarks/gsm8k/index.ts +0 -126
- package/examples/benchmarks/swebench-lite/index.ts +0 -178
- package/examples/multi-shot-optimization/index.ts +0 -114
- package/examples/same-sandbox-harness/index.ts +0 -63
package/dist/index.js
CHANGED
|
@@ -1,3 +1,8 @@
|
|
|
1
|
+
import {
|
|
2
|
+
BENCHMARK_SPLIT_SEED,
|
|
3
|
+
benchmarks_exports,
|
|
4
|
+
deterministicSplit
|
|
5
|
+
} from "./chunk-42I2QC2L.js";
|
|
1
6
|
import {
|
|
2
7
|
LlmCallError,
|
|
3
8
|
LlmClient,
|
|
@@ -6,9 +11,7 @@ import {
|
|
|
6
11
|
probeLlm,
|
|
7
12
|
stripFencedJson
|
|
8
13
|
} from "./chunk-JAOLXRIA.js";
|
|
9
|
-
import
|
|
10
|
-
__export
|
|
11
|
-
} from "./chunk-PZ5AY32C.js";
|
|
14
|
+
import "./chunk-PZ5AY32C.js";
|
|
12
15
|
|
|
13
16
|
// src/client.ts
|
|
14
17
|
var ProductClient = class {
|
|
@@ -649,9 +652,9 @@ function feedbackTrajectoryToOptimizerRow(trajectory) {
|
|
|
649
652
|
function feedbackTrajectoriesToOptimizerRows(trajectories) {
|
|
650
653
|
return trajectories.map(feedbackTrajectoryToOptimizerRow);
|
|
651
654
|
}
|
|
652
|
-
async function replayFeedbackTrajectory(trajectory,
|
|
655
|
+
async function replayFeedbackTrajectory(trajectory, adapter) {
|
|
653
656
|
try {
|
|
654
|
-
const result = await
|
|
657
|
+
const result = await adapter.replay(trajectory);
|
|
655
658
|
return {
|
|
656
659
|
trajectoryId: trajectory.id,
|
|
657
660
|
...result
|
|
@@ -680,10 +683,10 @@ async function replayFeedbackTrajectory(trajectory, adapter2) {
|
|
|
680
683
|
};
|
|
681
684
|
}
|
|
682
685
|
}
|
|
683
|
-
async function replayFeedbackTrajectories(trajectories,
|
|
686
|
+
async function replayFeedbackTrajectories(trajectories, adapter) {
|
|
684
687
|
const results = [];
|
|
685
688
|
for (const trajectory of trajectories) {
|
|
686
|
-
results.push(await replayFeedbackTrajectory(trajectory,
|
|
689
|
+
results.push(await replayFeedbackTrajectory(trajectory, adapter));
|
|
687
690
|
}
|
|
688
691
|
return results;
|
|
689
692
|
}
|
|
@@ -2342,7 +2345,7 @@ var DEFAULT_BUDGET = {
|
|
|
2342
2345
|
maxWallMs: 5 * 60 * 1e3
|
|
2343
2346
|
};
|
|
2344
2347
|
async function runAgentControlLoop(config) {
|
|
2345
|
-
const budget =
|
|
2348
|
+
const budget = normalizeBudget(config.budget);
|
|
2346
2349
|
const actionFailure = config.actionFailure ?? "continue";
|
|
2347
2350
|
const controller = new AbortController();
|
|
2348
2351
|
const upstreamAbort = () => controller.abort(config.signal?.reason);
|
|
@@ -2379,12 +2382,13 @@ async function runAgentControlLoop(config) {
|
|
|
2379
2382
|
try {
|
|
2380
2383
|
state = await config.observe({ history, abortSignal: controller.signal });
|
|
2381
2384
|
} catch (err) {
|
|
2382
|
-
|
|
2385
|
+
const error = runtimeError("observe", 0, err);
|
|
2386
|
+
runtimeErrors.push(error);
|
|
2383
2387
|
return finish(emitter, {
|
|
2384
2388
|
intent: config.intent,
|
|
2385
2389
|
pass: false,
|
|
2386
2390
|
completed: false,
|
|
2387
|
-
reason:
|
|
2391
|
+
reason: error.message,
|
|
2388
2392
|
steps: history,
|
|
2389
2393
|
finalState: void 0,
|
|
2390
2394
|
finalEvals: [],
|
|
@@ -2400,12 +2404,13 @@ async function runAgentControlLoop(config) {
|
|
|
2400
2404
|
evals = await config.validate({ intent: config.intent, state, history, abortSignal: controller.signal });
|
|
2401
2405
|
await recordEvalSpans(emitter, evals, "initial", runtimeErrors, 0);
|
|
2402
2406
|
} catch (err) {
|
|
2403
|
-
|
|
2407
|
+
const error = runtimeError("validate", 0, err);
|
|
2408
|
+
runtimeErrors.push(error);
|
|
2404
2409
|
return finish(emitter, {
|
|
2405
2410
|
intent: config.intent,
|
|
2406
2411
|
pass: false,
|
|
2407
2412
|
completed: false,
|
|
2408
|
-
reason:
|
|
2413
|
+
reason: error.message,
|
|
2409
2414
|
steps: history,
|
|
2410
2415
|
finalState: state,
|
|
2411
2416
|
finalEvals: [],
|
|
@@ -2575,13 +2580,14 @@ async function runAgentControlLoop(config) {
|
|
|
2575
2580
|
let actionOutcome;
|
|
2576
2581
|
try {
|
|
2577
2582
|
const result = await config.act(decision.action, ctx);
|
|
2578
|
-
const
|
|
2583
|
+
const rawCostUsd = config.getActionCostUsd?.({
|
|
2579
2584
|
action: decision.action,
|
|
2580
2585
|
result,
|
|
2581
2586
|
state,
|
|
2582
2587
|
evals,
|
|
2583
2588
|
history
|
|
2584
2589
|
});
|
|
2590
|
+
const costUsd = normalizeActionCostUsd(rawCostUsd, runtimeErrors, stepIndex);
|
|
2585
2591
|
if (costUsd !== void 0 && Number.isFinite(costUsd) && costUsd > 0) {
|
|
2586
2592
|
spentCostUsd += costUsd;
|
|
2587
2593
|
await recordCostBudget(emitter, budget, spentCostUsd, stepHandle, runtimeErrors, stepIndex);
|
|
@@ -2874,6 +2880,34 @@ function objectiveEval(input) {
|
|
|
2874
2880
|
function subjectiveEval(input) {
|
|
2875
2881
|
return { ...input, objective: false };
|
|
2876
2882
|
}
|
|
2883
|
+
function normalizeBudget(input) {
|
|
2884
|
+
const raw = { ...DEFAULT_BUDGET, ...input };
|
|
2885
|
+
if (!Number.isInteger(raw.maxSteps) || raw.maxSteps < 1) {
|
|
2886
|
+
throw new RangeError(`ControlRuntime budget.maxSteps must be an integer >= 1, got ${String(raw.maxSteps)}`);
|
|
2887
|
+
}
|
|
2888
|
+
const budget = { maxSteps: raw.maxSteps };
|
|
2889
|
+
if (raw.maxWallMs !== void 0) {
|
|
2890
|
+
if (typeof raw.maxWallMs !== "number" || !Number.isFinite(raw.maxWallMs) || raw.maxWallMs <= 0) {
|
|
2891
|
+
throw new RangeError(`ControlRuntime budget.maxWallMs must be a positive finite number, got ${String(raw.maxWallMs)}`);
|
|
2892
|
+
}
|
|
2893
|
+
budget.maxWallMs = raw.maxWallMs;
|
|
2894
|
+
}
|
|
2895
|
+
if (raw.maxCostUsd !== void 0) {
|
|
2896
|
+
if (typeof raw.maxCostUsd !== "number" || !Number.isFinite(raw.maxCostUsd) || raw.maxCostUsd < 0) {
|
|
2897
|
+
throw new RangeError(`ControlRuntime budget.maxCostUsd must be a nonnegative finite number, got ${String(raw.maxCostUsd)}`);
|
|
2898
|
+
}
|
|
2899
|
+
budget.maxCostUsd = raw.maxCostUsd;
|
|
2900
|
+
}
|
|
2901
|
+
return budget;
|
|
2902
|
+
}
|
|
2903
|
+
function normalizeActionCostUsd(costUsd, runtimeErrors, stepIndex) {
|
|
2904
|
+
if (costUsd === void 0) return void 0;
|
|
2905
|
+
if (!Number.isFinite(costUsd) || costUsd < 0) {
|
|
2906
|
+
runtimeErrors.push(runtimeError("act", stepIndex, new Error(`invalid action costUsd: ${String(costUsd)}`)));
|
|
2907
|
+
return void 0;
|
|
2908
|
+
}
|
|
2909
|
+
return costUsd;
|
|
2910
|
+
}
|
|
2877
2911
|
function allCriticalPassed(evals) {
|
|
2878
2912
|
return evals.every((result) => result.passed || result.severity !== "critical" && result.severity !== "error");
|
|
2879
2913
|
}
|
|
@@ -3124,7 +3158,7 @@ function isRequirementMissing(requirement, now) {
|
|
|
3124
3158
|
function isExpired(requirement, now) {
|
|
3125
3159
|
if (!requirement.validUntil) return false;
|
|
3126
3160
|
const deadline = Date.parse(requirement.validUntil);
|
|
3127
|
-
if (!Number.isFinite(deadline)) return
|
|
3161
|
+
if (!Number.isFinite(deadline)) return true;
|
|
3128
3162
|
return deadline <= now.getTime();
|
|
3129
3163
|
}
|
|
3130
3164
|
function isBlockingGap(requirement) {
|
|
@@ -3133,11 +3167,11 @@ function isBlockingGap(requirement) {
|
|
|
3133
3167
|
function chooseRecommendedAction(blocking, nonBlocking) {
|
|
3134
3168
|
const gaps = blocking.length > 0 ? blocking : nonBlocking;
|
|
3135
3169
|
if (gaps.length === 0) return "run_agent";
|
|
3136
|
-
if (
|
|
3137
|
-
if (
|
|
3138
|
-
if (
|
|
3139
|
-
if (
|
|
3140
|
-
if (
|
|
3170
|
+
if (gaps.some((gap) => gap.acquisitionMode === "ask_user" || gap.fallbackPolicy === "ask")) return "ask_user";
|
|
3171
|
+
if (gaps.some((gap) => gap.acquisitionMode === "query_connector")) return "query_connectors";
|
|
3172
|
+
if (gaps.some((gap) => gap.acquisitionMode === "inspect_repo" || gap.acquisitionMode === "run_command")) return "inspect_repo";
|
|
3173
|
+
if (gaps.some((gap) => gap.acquisitionMode === "search_web")) return "collect_web_data";
|
|
3174
|
+
if (gaps.some((gap) => gap.acquisitionMode === "not_available")) return "abort_or_rescope";
|
|
3141
3175
|
if (nonBlocking.some((gap) => gap.importance === "high")) return "build_domain_wiki";
|
|
3142
3176
|
return "continue_with_caveat";
|
|
3143
3177
|
}
|
|
@@ -4074,12 +4108,15 @@ var DEFAULT_RUN_SCORE_WEIGHTS = {
|
|
|
4074
4108
|
};
|
|
4075
4109
|
function aggregateRunScore(score, weights = {}) {
|
|
4076
4110
|
const w = { ...DEFAULT_RUN_SCORE_WEIGHTS, ...weights };
|
|
4077
|
-
return w.success * clamp012(score.success) + w.goalProgress * clamp012(score.goalProgress) + w.repoGroundedness * clamp012(score.repoGroundedness) + w.driftPenalty * clamp012(score.driftPenalty) + w.toolUseQuality * clamp012(score.toolUseQuality) + w.patchQuality * clamp012(score.patchQuality) + w.testReality * clamp012(score.testReality) + w.finalGate * clamp012(score.finalGate) + w.reviewerBlockers * clamp012(score.reviewerBlockers) + w.costUsd * Math.max(0, score.costUsd) + w.wallSeconds * Math.max(0, score.wallSeconds / 60);
|
|
4111
|
+
return w.success * clamp012(score.success) + w.goalProgress * clamp012(score.goalProgress) + w.repoGroundedness * clamp012(score.repoGroundedness) + w.driftPenalty * clamp012(score.driftPenalty) + w.toolUseQuality * clamp012(score.toolUseQuality) + w.patchQuality * clamp012(score.patchQuality) + w.testReality * clamp012(score.testReality) + w.finalGate * clamp012(score.finalGate) + w.reviewerBlockers * clamp012(score.reviewerBlockers) + w.costUsd * Math.max(0, finiteOrZero(score.costUsd)) + w.wallSeconds * Math.max(0, finiteOrZero(score.wallSeconds) / 60);
|
|
4078
4112
|
}
|
|
4079
4113
|
function clamp012(value) {
|
|
4080
4114
|
if (!Number.isFinite(value)) return 0;
|
|
4081
4115
|
return Math.max(0, Math.min(1, value));
|
|
4082
4116
|
}
|
|
4117
|
+
function finiteOrZero(value) {
|
|
4118
|
+
return Number.isFinite(value) ? value : 0;
|
|
4119
|
+
}
|
|
4083
4120
|
|
|
4084
4121
|
// src/run-critic.ts
|
|
4085
4122
|
var DEFAULT_DRIFT_PATTERNS = [
|
|
@@ -4286,13 +4323,15 @@ var AxGepaSteeringOptimizer = class {
|
|
|
4286
4323
|
const compiled = await optimizer.compile(
|
|
4287
4324
|
selector,
|
|
4288
4325
|
train,
|
|
4289
|
-
(
|
|
4326
|
+
({ prediction, example }) => prediction?.variantId === example?.variantId ? 1 : 0,
|
|
4290
4327
|
{
|
|
4291
4328
|
validationExamples: validation,
|
|
4292
4329
|
maxMetricCalls: 64
|
|
4293
4330
|
}
|
|
4294
4331
|
);
|
|
4295
|
-
|
|
4332
|
+
if (compiled.optimizedProgram !== void 0) {
|
|
4333
|
+
selector.applyOptimization(compiled.optimizedProgram);
|
|
4334
|
+
}
|
|
4296
4335
|
return {
|
|
4297
4336
|
...fallback,
|
|
4298
4337
|
backend: "ax-gepa",
|
|
@@ -10410,20 +10449,20 @@ function mergeLayerResults(name, perAdapter, options = {}) {
|
|
|
10410
10449
|
let durationMs = 0;
|
|
10411
10450
|
const reasonParts = [];
|
|
10412
10451
|
const diagnostics = {};
|
|
10413
|
-
for (const { adapter
|
|
10452
|
+
for (const { adapter, result } of perAdapter) {
|
|
10414
10453
|
status = worst(status, result.status);
|
|
10415
10454
|
if (typeof result.score === "number") {
|
|
10416
10455
|
weightedScoreSum += result.score;
|
|
10417
10456
|
weightCount += 1;
|
|
10418
10457
|
}
|
|
10419
10458
|
durationMs = mergeDuration === "sum" ? durationMs + result.durationMs : Math.max(durationMs, result.durationMs);
|
|
10420
|
-
reasonParts.push(`${
|
|
10459
|
+
reasonParts.push(`${adapter}: ${result.status}`);
|
|
10421
10460
|
for (const f2 of result.findings) {
|
|
10422
10461
|
findings.push({
|
|
10423
10462
|
...f2,
|
|
10424
10463
|
layer: name,
|
|
10425
|
-
message: prefix ? `${prefix(
|
|
10426
|
-
detail: { ...f2.detail ?? {}, adapter
|
|
10464
|
+
message: prefix ? `${prefix(adapter)} ${f2.message}` : f2.message,
|
|
10465
|
+
detail: { ...f2.detail ?? {}, adapter }
|
|
10427
10466
|
});
|
|
10428
10467
|
}
|
|
10429
10468
|
for (const [k, v] of Object.entries(result.diagnostics ?? {})) {
|
|
@@ -10442,8 +10481,8 @@ function mergeLayerResults(name, perAdapter, options = {}) {
|
|
|
10442
10481
|
reason: reasonParts.join(" \xB7 "),
|
|
10443
10482
|
diagnostics: Object.keys(diagnostics).length > 0 ? diagnostics : void 0,
|
|
10444
10483
|
detail: {
|
|
10445
|
-
adapters: perAdapter.map(({ adapter
|
|
10446
|
-
adapter
|
|
10484
|
+
adapters: perAdapter.map(({ adapter, result }) => ({
|
|
10485
|
+
adapter,
|
|
10447
10486
|
status: result.status,
|
|
10448
10487
|
score: result.score ?? null
|
|
10449
10488
|
})),
|
|
@@ -10469,10 +10508,10 @@ function multiToolchainLayer(config) {
|
|
|
10469
10508
|
reason: "no adapters detected"
|
|
10470
10509
|
};
|
|
10471
10510
|
}
|
|
10472
|
-
const runOne = async (
|
|
10473
|
-
const adapterName = config.adapterName(
|
|
10511
|
+
const runOne = async (adapter) => {
|
|
10512
|
+
const adapterName = config.adapterName(adapter);
|
|
10474
10513
|
try {
|
|
10475
|
-
const r = await config.run(
|
|
10514
|
+
const r = await config.run(adapter, ctx);
|
|
10476
10515
|
return { adapter: adapterName, result: r };
|
|
10477
10516
|
} catch (err) {
|
|
10478
10517
|
return {
|
|
@@ -11908,8 +11947,8 @@ function formatPct(value) {
|
|
|
11908
11947
|
function bySplitOrder(a, b) {
|
|
11909
11948
|
return ALL_SPLITS.indexOf(a) - ALL_SPLITS.indexOf(b);
|
|
11910
11949
|
}
|
|
11911
|
-
function runAdapter(
|
|
11912
|
-
return typeof
|
|
11950
|
+
function runAdapter(adapter, scenario, context) {
|
|
11951
|
+
return typeof adapter === "function" ? adapter(scenario, context) : adapter.run(scenario, context);
|
|
11913
11952
|
}
|
|
11914
11953
|
function throwIfAborted(signal) {
|
|
11915
11954
|
if (!signal?.aborted) return;
|
|
@@ -12325,6 +12364,24 @@ function fmt2(x) {
|
|
|
12325
12364
|
}
|
|
12326
12365
|
|
|
12327
12366
|
// src/researcher.ts
|
|
12367
|
+
var CallbackResearcher = class {
|
|
12368
|
+
constructor(callbacks) {
|
|
12369
|
+
this.callbacks = callbacks;
|
|
12370
|
+
}
|
|
12371
|
+
callbacks;
|
|
12372
|
+
inspectFailures(runs) {
|
|
12373
|
+
return this.callbacks.inspectFailures(runs);
|
|
12374
|
+
}
|
|
12375
|
+
proposeChange(failures) {
|
|
12376
|
+
return this.callbacks.proposeChange(failures);
|
|
12377
|
+
}
|
|
12378
|
+
applyChange(changes, baseline) {
|
|
12379
|
+
return this.callbacks.applyChange(changes, baseline);
|
|
12380
|
+
}
|
|
12381
|
+
evaluateChange(plan) {
|
|
12382
|
+
return this.callbacks.evaluateChange(plan);
|
|
12383
|
+
}
|
|
12384
|
+
};
|
|
12328
12385
|
var NoopResearcher = class {
|
|
12329
12386
|
hint;
|
|
12330
12387
|
constructor(hint = "NoopResearcher: no implementation wired") {
|
|
@@ -12777,214 +12834,6 @@ function mean7(xs) {
|
|
|
12777
12834
|
return xs.reduce((s, x) => s + x, 0) / xs.length;
|
|
12778
12835
|
}
|
|
12779
12836
|
|
|
12780
|
-
// src/benchmarks/types.ts
|
|
12781
|
-
function fnv1a32(input) {
|
|
12782
|
-
let h = 2166136261;
|
|
12783
|
-
for (let i = 0; i < input.length; i++) {
|
|
12784
|
-
h ^= input.charCodeAt(i) & 255;
|
|
12785
|
-
h = h + ((h << 1) + (h << 4) + (h << 7) + (h << 8) + (h << 24)) >>> 0;
|
|
12786
|
-
}
|
|
12787
|
-
return h >>> 0;
|
|
12788
|
-
}
|
|
12789
|
-
var BENCHMARK_SPLIT_SEED = "agent-eval-v1";
|
|
12790
|
-
function deterministicSplit(itemId, seed = BENCHMARK_SPLIT_SEED) {
|
|
12791
|
-
const h = fnv1a32(`${seed}::${itemId}`);
|
|
12792
|
-
const pos = h / 4294967296;
|
|
12793
|
-
if (pos < 0.6) return "search";
|
|
12794
|
-
if (pos < 0.8) return "dev";
|
|
12795
|
-
return "holdout";
|
|
12796
|
-
}
|
|
12797
|
-
|
|
12798
|
-
// src/benchmarks/index.ts
|
|
12799
|
-
var benchmarks_exports = {};
|
|
12800
|
-
__export(benchmarks_exports, {
|
|
12801
|
-
BENCHMARK_SPLIT_SEED: () => BENCHMARK_SPLIT_SEED,
|
|
12802
|
-
deterministicSplit: () => deterministicSplit,
|
|
12803
|
-
routing: () => routing_exports
|
|
12804
|
-
});
|
|
12805
|
-
|
|
12806
|
-
// src/benchmarks/routing/index.ts
|
|
12807
|
-
var routing_exports = {};
|
|
12808
|
-
__export(routing_exports, {
|
|
12809
|
-
ROUTING_DATASET: () => ROUTING_DATASET,
|
|
12810
|
-
RoutingAdapter: () => RoutingAdapter,
|
|
12811
|
-
assignSplit: () => assignSplit,
|
|
12812
|
-
evaluate: () => evaluate,
|
|
12813
|
-
extractRouteTokens: () => extractRouteTokens,
|
|
12814
|
-
loadDataset: () => loadDataset
|
|
12815
|
-
});
|
|
12816
|
-
|
|
12817
|
-
// src/benchmarks/routing/dataset.ts
|
|
12818
|
-
var ROUTING_DATASET = [
|
|
12819
|
-
{
|
|
12820
|
-
id: "file_001",
|
|
12821
|
-
category: "file",
|
|
12822
|
-
prompt: "Save the meeting notes to /tmp/notes-2025-04.md as markdown.",
|
|
12823
|
-
route: "fs.write",
|
|
12824
|
-
synonyms: ["filesystem.write", "write_file"],
|
|
12825
|
-
hardNegatives: ["fs.read", "chat.reply"]
|
|
12826
|
-
},
|
|
12827
|
-
{
|
|
12828
|
-
id: "file_002",
|
|
12829
|
-
category: "file",
|
|
12830
|
-
prompt: "Read the contents of /etc/hosts and summarize the entries.",
|
|
12831
|
-
route: "fs.read",
|
|
12832
|
-
synonyms: ["filesystem.read", "read_file"],
|
|
12833
|
-
hardNegatives: ["fs.write", "search.web"]
|
|
12834
|
-
},
|
|
12835
|
-
{
|
|
12836
|
-
id: "file_003",
|
|
12837
|
-
category: "file",
|
|
12838
|
-
prompt: "List every Python file under src/ recursively.",
|
|
12839
|
-
route: "fs.list",
|
|
12840
|
-
synonyms: ["filesystem.list", "list_files"],
|
|
12841
|
-
hardNegatives: ["fs.read", "search.code"]
|
|
12842
|
-
},
|
|
12843
|
-
{
|
|
12844
|
-
id: "file_004",
|
|
12845
|
-
category: "file",
|
|
12846
|
-
prompt: "Delete the cached build at .turbo/cache.",
|
|
12847
|
-
route: "fs.delete",
|
|
12848
|
-
synonyms: ["filesystem.delete", "remove_file"],
|
|
12849
|
-
hardNegatives: ["fs.write", "fs.list"]
|
|
12850
|
-
},
|
|
12851
|
-
{
|
|
12852
|
-
id: "math_001",
|
|
12853
|
-
category: "math",
|
|
12854
|
-
prompt: "What is the integral of 3x^2 + 2x from 0 to 5?",
|
|
12855
|
-
route: "math.integral",
|
|
12856
|
-
synonyms: ["calculator.integral", "math.solve"],
|
|
12857
|
-
hardNegatives: ["math.derivative", "chat.reply"]
|
|
12858
|
-
},
|
|
12859
|
-
{
|
|
12860
|
-
id: "math_002",
|
|
12861
|
-
category: "math",
|
|
12862
|
-
prompt: "Compute the derivative of sin(x) * cos(x).",
|
|
12863
|
-
route: "math.derivative",
|
|
12864
|
-
synonyms: ["calculator.derivative", "math.solve"],
|
|
12865
|
-
hardNegatives: ["math.integral", "math.algebra"]
|
|
12866
|
-
},
|
|
12867
|
-
{
|
|
12868
|
-
id: "math_003",
|
|
12869
|
-
category: "math",
|
|
12870
|
-
prompt: "Solve 2x + 7 = 19 for x.",
|
|
12871
|
-
route: "math.algebra",
|
|
12872
|
-
synonyms: ["calculator.algebra", "math.solve"],
|
|
12873
|
-
hardNegatives: ["math.derivative", "math.integral"]
|
|
12874
|
-
},
|
|
12875
|
-
{
|
|
12876
|
-
id: "math_004",
|
|
12877
|
-
category: "math",
|
|
12878
|
-
prompt: "What is the prime factorization of 360?",
|
|
12879
|
-
route: "math.numbertheory",
|
|
12880
|
-
synonyms: ["calculator.factor", "math.solve"],
|
|
12881
|
-
hardNegatives: ["math.algebra", "search.web"]
|
|
12882
|
-
},
|
|
12883
|
-
{
|
|
12884
|
-
id: "search_001",
|
|
12885
|
-
category: "search",
|
|
12886
|
-
prompt: "Find recent papers on agent prompt optimization with held-out promotion gates.",
|
|
12887
|
-
route: "search.web",
|
|
12888
|
-
synonyms: ["web.search", "search.papers"],
|
|
12889
|
-
hardNegatives: ["search.code", "chat.reply"]
|
|
12890
|
-
},
|
|
12891
|
-
{
|
|
12892
|
-
id: "search_002",
|
|
12893
|
-
category: "search",
|
|
12894
|
-
prompt: "Search the codebase for every call site of `runProposeReview`.",
|
|
12895
|
-
route: "search.code",
|
|
12896
|
-
synonyms: ["code.search", "grep"],
|
|
12897
|
-
hardNegatives: ["search.web", "fs.read"]
|
|
12898
|
-
},
|
|
12899
|
-
{
|
|
12900
|
-
id: "search_003",
|
|
12901
|
-
category: "search",
|
|
12902
|
-
prompt: "What is the latest release of the Tangle network on GitHub?",
|
|
12903
|
-
route: "search.web",
|
|
12904
|
-
synonyms: ["web.search", "github.releases"],
|
|
12905
|
-
hardNegatives: ["search.code", "chat.reply"]
|
|
12906
|
-
},
|
|
12907
|
-
{
|
|
12908
|
-
id: "search_004",
|
|
12909
|
-
category: "search",
|
|
12910
|
-
prompt: "Find all TODO comments in the agent-eval src tree.",
|
|
12911
|
-
route: "search.code",
|
|
12912
|
-
synonyms: ["code.search", "grep"],
|
|
12913
|
-
hardNegatives: ["search.web", "fs.list"]
|
|
12914
|
-
},
|
|
12915
|
-
{
|
|
12916
|
-
id: "chat_001",
|
|
12917
|
-
category: "chat",
|
|
12918
|
-
prompt: "Hi there, how are you doing today?",
|
|
12919
|
-
route: "chat.reply",
|
|
12920
|
-
synonyms: ["conversation.reply"],
|
|
12921
|
-
hardNegatives: ["search.web", "fs.read"]
|
|
12922
|
-
},
|
|
12923
|
-
{
|
|
12924
|
-
id: "chat_002",
|
|
12925
|
-
category: "chat",
|
|
12926
|
-
prompt: "Please explain the difference between an LLM and a foundation model.",
|
|
12927
|
-
route: "chat.reply",
|
|
12928
|
-
synonyms: ["conversation.reply", "qa.answer"],
|
|
12929
|
-
hardNegatives: ["search.web", "math.algebra"]
|
|
12930
|
-
},
|
|
12931
|
-
{
|
|
12932
|
-
id: "chat_003",
|
|
12933
|
-
category: "chat",
|
|
12934
|
-
prompt: "Tell me a short joke about distributed systems.",
|
|
12935
|
-
route: "chat.reply",
|
|
12936
|
-
synonyms: ["conversation.reply"],
|
|
12937
|
-
hardNegatives: ["search.web", "fs.read"]
|
|
12938
|
-
},
|
|
12939
|
-
{
|
|
12940
|
-
id: "chat_004",
|
|
12941
|
-
category: "chat",
|
|
12942
|
-
prompt: "Acknowledge my last message with a thumbs up.",
|
|
12943
|
-
route: "chat.reply",
|
|
12944
|
-
synonyms: ["conversation.reply", "react"],
|
|
12945
|
-
hardNegatives: ["fs.write", "search.web"]
|
|
12946
|
-
}
|
|
12947
|
-
];
|
|
12948
|
-
|
|
12949
|
-
// src/benchmarks/routing/index.ts
|
|
12950
|
-
var RoutingAdapter = class {
|
|
12951
|
-
async loadDataset(split) {
|
|
12952
|
-
return ROUTING_DATASET.map((item) => ({ id: item.id, payload: item })).filter((it) => assignSplitImpl(it.id) === split);
|
|
12953
|
-
}
|
|
12954
|
-
async evaluate(item, response) {
|
|
12955
|
-
const tokens2 = extractRouteTokens(response);
|
|
12956
|
-
const correct = new Set([item.payload.route, ...item.payload.synonyms].map((s) => s.toLowerCase()));
|
|
12957
|
-
const hardNeg = new Set(item.payload.hardNegatives.map((s) => s.toLowerCase()));
|
|
12958
|
-
const firstMatch = tokens2.find((t) => correct.has(t.toLowerCase())) ?? null;
|
|
12959
|
-
const firstHardNeg = tokens2.find((t) => hardNeg.has(t.toLowerCase())) ?? null;
|
|
12960
|
-
const score = firstMatch ? 1 : 0;
|
|
12961
|
-
return {
|
|
12962
|
-
score,
|
|
12963
|
-
raw: {
|
|
12964
|
-
firstToken: tokens2[0] ?? null,
|
|
12965
|
-
matchedRoute: firstMatch,
|
|
12966
|
-
hitHardNegative: Boolean(firstHardNeg),
|
|
12967
|
-
hardNegativeRoute: firstHardNeg,
|
|
12968
|
-
category: item.payload.category
|
|
12969
|
-
}
|
|
12970
|
-
};
|
|
12971
|
-
}
|
|
12972
|
-
assignSplit(itemId) {
|
|
12973
|
-
return assignSplitImpl(itemId);
|
|
12974
|
-
}
|
|
12975
|
-
};
|
|
12976
|
-
function assignSplitImpl(itemId) {
|
|
12977
|
-
return deterministicSplit(`routing::${itemId}`);
|
|
12978
|
-
}
|
|
12979
|
-
function extractRouteTokens(response) {
|
|
12980
|
-
const matches2 = response.match(/[a-z][a-z0-9_]*\.[a-z][a-z0-9_]*/gi);
|
|
12981
|
-
return matches2 ?? [];
|
|
12982
|
-
}
|
|
12983
|
-
var adapter = new RoutingAdapter();
|
|
12984
|
-
var loadDataset = adapter.loadDataset.bind(adapter);
|
|
12985
|
-
var evaluate = adapter.evaluate.bind(adapter);
|
|
12986
|
-
var assignSplit = adapter.assignSplit.bind(adapter);
|
|
12987
|
-
|
|
12988
12837
|
// src/reference-replay-steering.ts
|
|
12989
12838
|
function referenceReplayRunsToSteeringRows(runs, options = {}) {
|
|
12990
12839
|
const rows = [];
|
|
@@ -14517,7 +14366,13 @@ var TRACE_ANALYST_TRUNCATION_MARKER_PREFIX = "[trace-analyst truncated:";
|
|
|
14517
14366
|
|
|
14518
14367
|
// src/trace-analyst/store.ts
|
|
14519
14368
|
function compileSearchRegex(pattern) {
|
|
14520
|
-
|
|
14369
|
+
let source = pattern;
|
|
14370
|
+
let flags = "m";
|
|
14371
|
+
if (source.startsWith("(?i)")) {
|
|
14372
|
+
source = source.slice(4);
|
|
14373
|
+
flags += "i";
|
|
14374
|
+
}
|
|
14375
|
+
return new RegExp(source, flags);
|
|
14521
14376
|
}
|
|
14522
14377
|
function truncateForBudget(value, byteCap) {
|
|
14523
14378
|
const original = Buffer.byteLength(value, "utf8");
|
|
@@ -14690,19 +14545,26 @@ var OtlpFileTraceStore = class {
|
|
|
14690
14545
|
const buf = await this.buffer();
|
|
14691
14546
|
const hits = [];
|
|
14692
14547
|
let total = 0;
|
|
14548
|
+
let capped = false;
|
|
14693
14549
|
for (const s of trace.spans) {
|
|
14694
|
-
const
|
|
14550
|
+
const remaining = max_matches - hits.length;
|
|
14551
|
+
const localHits = await this.scanSpanForMatches(buf, trace.trace_id, s, re, this.perMatchTextBudget, remaining);
|
|
14695
14552
|
total += localHits.total;
|
|
14696
14553
|
for (const h of localHits.records) {
|
|
14697
14554
|
if (hits.length >= max_matches) break;
|
|
14698
14555
|
hits.push(h);
|
|
14699
14556
|
}
|
|
14557
|
+
if (hits.length >= max_matches) {
|
|
14558
|
+
capped = true;
|
|
14559
|
+
total = Math.max(total, hits.length + 1);
|
|
14560
|
+
break;
|
|
14561
|
+
}
|
|
14700
14562
|
}
|
|
14701
14563
|
return {
|
|
14702
14564
|
trace_id: trace.trace_id,
|
|
14703
14565
|
hits,
|
|
14704
14566
|
total_matches: total,
|
|
14705
|
-
has_more: total > hits.length
|
|
14567
|
+
has_more: capped || total > hits.length
|
|
14706
14568
|
};
|
|
14707
14569
|
}
|
|
14708
14570
|
async searchSpan(opts) {
|
|
@@ -14719,14 +14581,13 @@ var OtlpFileTraceStore = class {
|
|
|
14719
14581
|
}
|
|
14720
14582
|
const re = compileSearchRegex(opts.regex_pattern);
|
|
14721
14583
|
const buf = await this.buffer();
|
|
14722
|
-
const localHits = await this.scanSpanForMatches(buf, trace.trace_id, span, re, this.perMatchTextBudget);
|
|
14723
|
-
const truncated = localHits.records.slice(0, max_matches);
|
|
14584
|
+
const localHits = await this.scanSpanForMatches(buf, trace.trace_id, span, re, this.perMatchTextBudget, max_matches);
|
|
14724
14585
|
return {
|
|
14725
14586
|
trace_id: trace.trace_id,
|
|
14726
14587
|
span_id: span.span_id,
|
|
14727
|
-
hits:
|
|
14588
|
+
hits: localHits.records,
|
|
14728
14589
|
total_matches: localHits.total,
|
|
14729
|
-
has_more: localHits.total >
|
|
14590
|
+
has_more: localHits.total > localHits.records.length
|
|
14730
14591
|
};
|
|
14731
14592
|
}
|
|
14732
14593
|
// ─── Index building ────────────────────────────────────────────────
|
|
@@ -14958,15 +14819,20 @@ var OtlpFileTraceStore = class {
|
|
|
14958
14819
|
error_span_count: errorCount
|
|
14959
14820
|
};
|
|
14960
14821
|
}
|
|
14961
|
-
async scanSpanForMatches(buf, trace_id, s, re, textBudget) {
|
|
14822
|
+
async scanSpanForMatches(buf, trace_id, s, re, textBudget, recordCap) {
|
|
14962
14823
|
const slice = buf.subarray(s.line_byte_offset, s.line_byte_offset + s.line_byte_length).toString("utf8");
|
|
14963
14824
|
const records = [];
|
|
14964
14825
|
const globalRe = new RegExp(re.source, re.flags.includes("g") ? re.flags : `${re.flags}g`);
|
|
14965
14826
|
let total = 0;
|
|
14827
|
+
let hasMore = false;
|
|
14966
14828
|
let m;
|
|
14967
14829
|
while ((m = globalRe.exec(slice)) !== null) {
|
|
14968
14830
|
total += 1;
|
|
14969
14831
|
if (m.index === globalRe.lastIndex) globalRe.lastIndex += 1;
|
|
14832
|
+
if (records.length >= recordCap) {
|
|
14833
|
+
hasMore = true;
|
|
14834
|
+
break;
|
|
14835
|
+
}
|
|
14970
14836
|
const before = slice.slice(Math.max(0, m.index - textBudget / 2), m.index);
|
|
14971
14837
|
const after = slice.slice(
|
|
14972
14838
|
m.index + m[0].length,
|
|
@@ -14984,7 +14850,7 @@ var OtlpFileTraceStore = class {
|
|
|
14984
14850
|
match_offset: m.index
|
|
14985
14851
|
});
|
|
14986
14852
|
}
|
|
14987
|
-
return { records, total };
|
|
14853
|
+
return { records, total, hasMore };
|
|
14988
14854
|
}
|
|
14989
14855
|
};
|
|
14990
14856
|
var TraceFileMissingError = class extends Error {
|
|
@@ -15436,11 +15302,22 @@ async function analyzeTraces(input, options) {
|
|
|
15436
15302
|
findings: Array.isArray(result.findings) ? result.findings.filter((s) => typeof s === "string") : [],
|
|
15437
15303
|
turns,
|
|
15438
15304
|
turnCount: turns.length,
|
|
15439
|
-
usage: analyst.getUsage(),
|
|
15440
|
-
chatLog: analyst.getChatLog(),
|
|
15305
|
+
usage: normalizeRoleArrays(analyst.getUsage()),
|
|
15306
|
+
chatLog: normalizeRoleArrays(analyst.getChatLog()),
|
|
15441
15307
|
actorPromptVersion: TRACE_ANALYST_ACTOR_DESCRIPTION_VERSION
|
|
15442
15308
|
};
|
|
15443
15309
|
}
|
|
15310
|
+
function normalizeRoleArrays(value) {
|
|
15311
|
+
const record = value && typeof value === "object" ? value : {};
|
|
15312
|
+
return {
|
|
15313
|
+
actor: normalizeRecordArray(record.actor),
|
|
15314
|
+
responder: normalizeRecordArray(record.responder)
|
|
15315
|
+
};
|
|
15316
|
+
}
|
|
15317
|
+
function normalizeRecordArray(value) {
|
|
15318
|
+
if (!Array.isArray(value)) return [];
|
|
15319
|
+
return value.map((item) => item && typeof item === "object" ? { ...item } : { value: item });
|
|
15320
|
+
}
|
|
15444
15321
|
|
|
15445
15322
|
// src/trace-analyst/insights.ts
|
|
15446
15323
|
var DOMAIN_STOP_WORDS = /* @__PURE__ */ new Set([
|
|
@@ -15696,6 +15573,7 @@ export {
|
|
|
15696
15573
|
BudgetBreachError,
|
|
15697
15574
|
BudgetGuard,
|
|
15698
15575
|
BuilderSession,
|
|
15576
|
+
CallbackResearcher,
|
|
15699
15577
|
ConvergenceTracker,
|
|
15700
15578
|
CostLedger,
|
|
15701
15579
|
CostTracker,
|