@tangle-network/agent-eval 0.70.0 → 0.72.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +24 -0
- package/dist/adapters/http.js +1 -1
- package/dist/adapters/http.js.map +1 -1
- package/dist/campaign/index.d.ts +10 -0
- package/dist/campaign/index.js +48 -11
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-ZZCQQHW7.js → chunk-4QJN7RDX.js} +4 -4
- package/dist/chunk-4QJN7RDX.js.map +1 -0
- package/dist/{chunk-3B7Y5AUR.js → chunk-GWGO2K6Y.js} +3 -2
- package/dist/chunk-GWGO2K6Y.js.map +1 -0
- package/dist/{chunk-Z4ZCBC7M.js → chunk-ODGETRTM.js} +4 -3
- package/dist/chunk-ODGETRTM.js.map +1 -0
- package/dist/chunk-SL55X4VN.js +186 -0
- package/dist/chunk-SL55X4VN.js.map +1 -0
- package/dist/{chunk-GYELOWB6.js → chunk-UD6EF73X.js} +3 -3
- package/dist/{chunk-6XQIEUQ2.js → chunk-ZPSKPT3V.js} +5 -3
- package/dist/{chunk-6XQIEUQ2.js.map → chunk-ZPSKPT3V.js.map} +1 -1
- package/dist/contract/index.js +3 -3
- package/dist/index.js +31 -171
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +1 -1
- package/dist/rl.d.ts +155 -1
- package/dist/rl.js +195 -6
- package/dist/rl.js.map +1 -1
- package/dist/{run-campaign-BVY3RGAZ.js → run-campaign-OVEZF24D.js} +2 -2
- package/dist/traces.js +1 -1
- package/package.json +1 -1
- package/dist/chunk-3B7Y5AUR.js.map +0 -1
- package/dist/chunk-PQV2TKC3.js +0 -27
- package/dist/chunk-PQV2TKC3.js.map +0 -1
- package/dist/chunk-Z4ZCBC7M.js.map +0 -1
- package/dist/chunk-ZZCQQHW7.js.map +0 -1
- /package/dist/{chunk-GYELOWB6.js.map → chunk-UD6EF73X.js.map} +0 -0
- /package/dist/{run-campaign-BVY3RGAZ.js.map → run-campaign-OVEZF24D.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -1,6 +1,13 @@
|
|
|
1
1
|
import {
|
|
2
|
-
|
|
3
|
-
|
|
2
|
+
MODEL_PRICING,
|
|
3
|
+
MetricsCollector,
|
|
4
|
+
TokenCounter,
|
|
5
|
+
agentProfileHash,
|
|
6
|
+
estimateCost,
|
|
7
|
+
estimateTokens,
|
|
8
|
+
isModelPriced,
|
|
9
|
+
resolveModelPricing
|
|
10
|
+
} from "./chunk-SL55X4VN.js";
|
|
4
11
|
import {
|
|
5
12
|
HoldoutAuditor,
|
|
6
13
|
canaryLeakView,
|
|
@@ -31,12 +38,12 @@ import {
|
|
|
31
38
|
scoreRedTeamOutput,
|
|
32
39
|
surfaceContentHash,
|
|
33
40
|
toolNamesForRun
|
|
34
|
-
} from "./chunk-ZZCQQHW7.js";
|
|
41
|
+
} from "./chunk-4QJN7RDX.js";
|
|
35
42
|
import {
|
|
36
43
|
BackendIntegrityError,
|
|
37
44
|
assertRealBackend,
|
|
38
45
|
summarizeBackendIntegrity
|
|
39
|
-
} from "./chunk-6XQIEUQ2.js";
|
|
46
|
+
} from "./chunk-ZPSKPT3V.js";
|
|
40
47
|
import {
|
|
41
48
|
BENCHMARK_SPLIT_SEED,
|
|
42
49
|
benchmarks_exports,
|
|
@@ -49,7 +56,7 @@ import {
|
|
|
49
56
|
computeToolUseMetrics,
|
|
50
57
|
iqr,
|
|
51
58
|
welchsTTest
|
|
52
|
-
} from "./chunk-3B7Y5AUR.js";
|
|
59
|
+
} from "./chunk-GWGO2K6Y.js";
|
|
53
60
|
import {
|
|
54
61
|
exportTrainingData,
|
|
55
62
|
toNdjson
|
|
@@ -204,7 +211,7 @@ import {
|
|
|
204
211
|
tokenizeDomainWords,
|
|
205
212
|
traceAnalystFunctionGroup,
|
|
206
213
|
traceAnalystOnRunComplete
|
|
207
|
-
} from "./chunk-Z4ZCBC7M.js";
|
|
214
|
+
} from "./chunk-ODGETRTM.js";
|
|
208
215
|
import {
|
|
209
216
|
DEFAULT_REDACTION_RULES,
|
|
210
217
|
REDACTION_VERSION,
|
|
@@ -2595,14 +2602,15 @@ async function executeScenario(tc, scenario, config) {
|
|
|
2595
2602
|
const content = resp.choices?.[0]?.message?.content ?? "";
|
|
2596
2603
|
messages.push({ role: "assistant", content });
|
|
2597
2604
|
const codeRe = /```(\w+)?\n([\s\S]*?)```/g;
|
|
2598
|
-
let codeMatch;
|
|
2599
|
-
while (
|
|
2605
|
+
let codeMatch = codeRe.exec(content);
|
|
2606
|
+
while (codeMatch !== null) {
|
|
2600
2607
|
allCodeBlocks.push({ language: codeMatch[1] ?? "text", code: codeMatch[2] ?? "" });
|
|
2608
|
+
codeMatch = codeRe.exec(content);
|
|
2601
2609
|
}
|
|
2602
2610
|
const turnBlocks = [];
|
|
2603
|
-
let blockMatch;
|
|
2604
2611
|
const blockReLocal = new RegExp(blockRe.source, blockRe.flags);
|
|
2605
|
-
|
|
2612
|
+
let blockMatch = blockReLocal.exec(content);
|
|
2613
|
+
while (blockMatch !== null) {
|
|
2606
2614
|
const fields = {};
|
|
2607
2615
|
for (const line of (blockMatch[2] ?? "").split("\n")) {
|
|
2608
2616
|
const idx = line.indexOf(":");
|
|
@@ -2611,15 +2619,17 @@ async function executeScenario(tc, scenario, config) {
|
|
|
2611
2619
|
const blockType = blockMatch[1] ?? "";
|
|
2612
2620
|
allBlocks.push({ type: blockType, fields });
|
|
2613
2621
|
turnBlocks.push({ type: blockType, title: fields.title ?? "" });
|
|
2622
|
+
blockMatch = blockReLocal.exec(content);
|
|
2614
2623
|
}
|
|
2615
2624
|
let hasToolCall = false;
|
|
2616
2625
|
if (config.toolCallPatterns) {
|
|
2617
2626
|
for (const pattern of config.toolCallPatterns) {
|
|
2618
2627
|
const re = new RegExp(pattern.source, pattern.flags);
|
|
2619
|
-
let toolMatch;
|
|
2620
|
-
while (
|
|
2628
|
+
let toolMatch = re.exec(content);
|
|
2629
|
+
while (toolMatch !== null) {
|
|
2621
2630
|
allToolCalls.push(toolMatch[0]);
|
|
2622
2631
|
hasToolCall = true;
|
|
2632
|
+
toolMatch = re.exec(content);
|
|
2623
2633
|
}
|
|
2624
2634
|
}
|
|
2625
2635
|
}
|
|
@@ -2941,14 +2951,15 @@ var ProductClient = class {
|
|
|
2941
2951
|
}
|
|
2942
2952
|
}
|
|
2943
2953
|
const blockRe = /:::(\w+)\s*\n([\s\S]*?)\n\s*:::/g;
|
|
2944
|
-
let match;
|
|
2945
|
-
while (
|
|
2954
|
+
let match = blockRe.exec(text);
|
|
2955
|
+
while (match !== null) {
|
|
2946
2956
|
const fields = {};
|
|
2947
2957
|
for (const line of match[2].split("\n")) {
|
|
2948
2958
|
const idx = line.indexOf(":");
|
|
2949
2959
|
if (idx > 0) fields[line.slice(0, idx).trim()] = line.slice(idx + 1).trim();
|
|
2950
2960
|
}
|
|
2951
2961
|
blocks.push({ type: match[1], title: fields.title ?? "" });
|
|
2962
|
+
match = blockRe.exec(text);
|
|
2952
2963
|
}
|
|
2953
2964
|
return { text, blocks };
|
|
2954
2965
|
}
|
|
@@ -3089,158 +3100,6 @@ var ConvergenceTracker = class {
|
|
|
3089
3100
|
}
|
|
3090
3101
|
};
|
|
3091
3102
|
|
|
3092
|
-
// src/metrics.ts
|
|
3093
|
-
var MODEL_PRICING = {
|
|
3094
|
-
"gpt-4o": { input: 25e-4, output: 0.01 },
|
|
3095
|
-
"gpt-4o-mini": { input: 15e-5, output: 6e-4 },
|
|
3096
|
-
"gpt-4-turbo": { input: 0.01, output: 0.03 },
|
|
3097
|
-
"claude-sonnet-4-20250514": { input: 3e-3, output: 0.015 },
|
|
3098
|
-
"claude-opus-4-20250514": { input: 0.015, output: 0.075 },
|
|
3099
|
-
"claude-3-haiku-20240307": { input: 25e-5, output: 125e-5 }
|
|
3100
|
-
};
|
|
3101
|
-
var FAMILY_PRICING = [
|
|
3102
|
-
[/claude.*opus/, { input: 0.015, output: 0.075 }],
|
|
3103
|
-
[/claude.*haiku/, { input: 8e-4, output: 4e-3 }],
|
|
3104
|
-
[/claude.*sonnet|claude-code|claude-sonnet/, { input: 3e-3, output: 0.015 }],
|
|
3105
|
-
[/gpt-4o-mini/, { input: 15e-5, output: 6e-4 }],
|
|
3106
|
-
[/gpt-5|gpt-4\.1|o[134]\b/, { input: 125e-5, output: 0.01 }],
|
|
3107
|
-
[/gpt-4o|gpt-4/, { input: 25e-4, output: 0.01 }],
|
|
3108
|
-
[/deepseek/, { input: 3e-4, output: 11e-4 }],
|
|
3109
|
-
[/glm|zhipu|zai/, { input: 6e-4, output: 22e-4 }],
|
|
3110
|
-
[/kimi|moonshot/, { input: 6e-4, output: 25e-4 }],
|
|
3111
|
-
[/qwen/, { input: 4e-4, output: 12e-4 }],
|
|
3112
|
-
[/gemini.*flash/, { input: 1e-4, output: 4e-4 }],
|
|
3113
|
-
[/gemini/, { input: 125e-5, output: 5e-3 }],
|
|
3114
|
-
[/llama/, { input: 2e-4, output: 6e-4 }]
|
|
3115
|
-
];
|
|
3116
|
-
function normalizeModelId(model) {
|
|
3117
|
-
return (model.split("@")[0] ?? model).trim().toLowerCase();
|
|
3118
|
-
}
|
|
3119
|
-
function resolveModelPricing(model) {
|
|
3120
|
-
if (MODEL_PRICING[model]) return MODEL_PRICING[model];
|
|
3121
|
-
const id = normalizeModelId(model);
|
|
3122
|
-
if (MODEL_PRICING[id]) return MODEL_PRICING[id];
|
|
3123
|
-
for (const [pattern, price] of FAMILY_PRICING) {
|
|
3124
|
-
if (pattern.test(id)) return price;
|
|
3125
|
-
}
|
|
3126
|
-
return null;
|
|
3127
|
-
}
|
|
3128
|
-
function isModelPriced(model) {
|
|
3129
|
-
return resolveModelPricing(model) !== null;
|
|
3130
|
-
}
|
|
3131
|
-
var warnedUnpricedModels = /* @__PURE__ */ new Set();
|
|
3132
|
-
function estimateTokens(text) {
|
|
3133
|
-
return Math.ceil(text.length / 4);
|
|
3134
|
-
}
|
|
3135
|
-
function estimateCost(inputTokens, outputTokens, model) {
|
|
3136
|
-
const pricing = resolveModelPricing(model);
|
|
3137
|
-
if (!pricing) {
|
|
3138
|
-
if (!warnedUnpricedModels.has(model)) {
|
|
3139
|
-
warnedUnpricedModels.add(model);
|
|
3140
|
-
console.warn(
|
|
3141
|
-
`estimateCost: no pricing for model "${model}" \u2014 returning 0; add it to MODEL_PRICING/FAMILY_PRICING (cost/Pareto axes will be blank until then)`
|
|
3142
|
-
);
|
|
3143
|
-
}
|
|
3144
|
-
return 0;
|
|
3145
|
-
}
|
|
3146
|
-
return inputTokens / 1e3 * pricing.input + outputTokens / 1e3 * pricing.output;
|
|
3147
|
-
}
|
|
3148
|
-
var TokenCounter = class {
|
|
3149
|
-
totalInput = 0;
|
|
3150
|
-
totalOutput = 0;
|
|
3151
|
-
totalCost = 0;
|
|
3152
|
-
model;
|
|
3153
|
-
constructor(model = "gpt-4o") {
|
|
3154
|
-
this.model = model;
|
|
3155
|
-
}
|
|
3156
|
-
/** Record tokens for a turn, returns per-turn cost */
|
|
3157
|
-
record(inputTokens, outputTokens) {
|
|
3158
|
-
this.totalInput += inputTokens;
|
|
3159
|
-
this.totalOutput += outputTokens;
|
|
3160
|
-
const cost = estimateCost(inputTokens, outputTokens, this.model);
|
|
3161
|
-
this.totalCost += cost;
|
|
3162
|
-
return cost;
|
|
3163
|
-
}
|
|
3164
|
-
/** Estimate and record from raw text */
|
|
3165
|
-
recordFromText(inputText, outputText) {
|
|
3166
|
-
const inputTokens = estimateTokens(inputText);
|
|
3167
|
-
const outputTokens = estimateTokens(outputText);
|
|
3168
|
-
const cost = this.record(inputTokens, outputTokens);
|
|
3169
|
-
return { inputTokens, outputTokens, cost };
|
|
3170
|
-
}
|
|
3171
|
-
getTotalInput() {
|
|
3172
|
-
return this.totalInput;
|
|
3173
|
-
}
|
|
3174
|
-
getTotalOutput() {
|
|
3175
|
-
return this.totalOutput;
|
|
3176
|
-
}
|
|
3177
|
-
getTotalCost() {
|
|
3178
|
-
return this.totalCost;
|
|
3179
|
-
}
|
|
3180
|
-
};
|
|
3181
|
-
var MetricsCollector = class {
|
|
3182
|
-
client;
|
|
3183
|
-
workspaceId;
|
|
3184
|
-
metrics = [];
|
|
3185
|
-
constructor(client, workspaceId) {
|
|
3186
|
-
this.client = client;
|
|
3187
|
-
this.workspaceId = workspaceId;
|
|
3188
|
-
}
|
|
3189
|
-
/** Collect metrics after a turn completes */
|
|
3190
|
-
async collect(turn, responseLatencyMs, responseChars, codeBlocksProduced, blocksExtracted, completionCriteriaMet, completionCriteriaTotal, qualityScore, inputTokens = 0, outputTokens = 0, estimatedCostUsd = 0) {
|
|
3191
|
-
const state = await this.getState();
|
|
3192
|
-
const m = {
|
|
3193
|
-
turn,
|
|
3194
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3195
|
-
tasks: state.tasks,
|
|
3196
|
-
events: state.events,
|
|
3197
|
-
proposals: state.proposals,
|
|
3198
|
-
vaultFiles: state.vaultFiles.length,
|
|
3199
|
-
responseLatencyMs,
|
|
3200
|
-
responseChars,
|
|
3201
|
-
codeBlocksProduced,
|
|
3202
|
-
blocksExtracted,
|
|
3203
|
-
qualityScore,
|
|
3204
|
-
inputTokens,
|
|
3205
|
-
outputTokens,
|
|
3206
|
-
estimatedCostUsd,
|
|
3207
|
-
totalCostUsd: estimatedCostUsd,
|
|
3208
|
-
completionPercent: completionCriteriaTotal > 0 ? completionCriteriaMet / completionCriteriaTotal * 100 : 0
|
|
3209
|
-
};
|
|
3210
|
-
this.metrics.push(m);
|
|
3211
|
-
return m;
|
|
3212
|
-
}
|
|
3213
|
-
/** Get current product state */
|
|
3214
|
-
async getState() {
|
|
3215
|
-
const [tasks, events, approvals, vaultFiles] = await Promise.all([
|
|
3216
|
-
this.client.getTasks(this.workspaceId),
|
|
3217
|
-
this.client.getEvents(this.workspaceId),
|
|
3218
|
-
this.client.getApprovals(this.workspaceId),
|
|
3219
|
-
this.client.getVaultTree(this.workspaceId)
|
|
3220
|
-
]);
|
|
3221
|
-
return {
|
|
3222
|
-
tasks: tasks.length,
|
|
3223
|
-
events: events.length,
|
|
3224
|
-
proposals: {
|
|
3225
|
-
pending: approvals.filter((a) => a.status === "pending").length,
|
|
3226
|
-
approved: approvals.filter((a) => a.status === "approved").length,
|
|
3227
|
-
rejected: approvals.filter((a) => a.status === "rejected").length
|
|
3228
|
-
},
|
|
3229
|
-
vaultFiles,
|
|
3230
|
-
codeBlocks: 0,
|
|
3231
|
-
generations: 0
|
|
3232
|
-
};
|
|
3233
|
-
}
|
|
3234
|
-
/** Get all collected metrics */
|
|
3235
|
-
getMetrics() {
|
|
3236
|
-
return [...this.metrics];
|
|
3237
|
-
}
|
|
3238
|
-
/** Get convergence curve (completion% over turns) */
|
|
3239
|
-
getConvergenceCurve() {
|
|
3240
|
-
return this.metrics.map((m) => m.completionPercent);
|
|
3241
|
-
}
|
|
3242
|
-
};
|
|
3243
|
-
|
|
3244
3103
|
// src/driver.ts
|
|
3245
3104
|
var RIGOR_STANCE = {
|
|
3246
3105
|
cooperative: "Your stance: a pragmatic early adopter. You accept reasonable answers and only push back on clear gaps or outright errors.",
|
|
@@ -4680,8 +4539,8 @@ function analyzeAntiSlop(outputs, config) {
|
|
|
4680
4539
|
const lower = output.toLowerCase();
|
|
4681
4540
|
for (const phrase of config.bannedPhrases) {
|
|
4682
4541
|
const needle = phrase.toLowerCase();
|
|
4683
|
-
let idx = 0;
|
|
4684
|
-
while (
|
|
4542
|
+
let idx = lower.indexOf(needle, 0);
|
|
4543
|
+
while (idx !== -1) {
|
|
4685
4544
|
counts.banned_phrase += 1;
|
|
4686
4545
|
if (issues.length < 20) {
|
|
4687
4546
|
issues.push({
|
|
@@ -4690,7 +4549,7 @@ function analyzeAntiSlop(outputs, config) {
|
|
|
4690
4549
|
example: snippet(output, idx, phrase.length)
|
|
4691
4550
|
});
|
|
4692
4551
|
}
|
|
4693
|
-
idx
|
|
4552
|
+
idx = lower.indexOf(needle, idx + needle.length);
|
|
4694
4553
|
}
|
|
4695
4554
|
}
|
|
4696
4555
|
for (const re of config.bannedOpenings) {
|
|
@@ -8623,13 +8482,14 @@ function extractAssetUrls(html, baseUrl) {
|
|
|
8623
8482
|
const linkRe = /<link\b[^>]*\bhref\s*=\s*["']([^"']+)["'][^>]*>/gi;
|
|
8624
8483
|
const scriptRe = /<script\b[^>]*\bsrc\s*=\s*["']([^"']+)["'][^>]*>/gi;
|
|
8625
8484
|
for (const re of [linkRe, scriptRe]) {
|
|
8626
|
-
let match;
|
|
8627
|
-
while (
|
|
8485
|
+
let match = re.exec(html);
|
|
8486
|
+
while (match !== null) {
|
|
8628
8487
|
const raw = match[1];
|
|
8629
8488
|
try {
|
|
8630
8489
|
urls.add(new URL(raw, baseUrl).toString());
|
|
8631
8490
|
} catch {
|
|
8632
8491
|
}
|
|
8492
|
+
match = re.exec(html);
|
|
8633
8493
|
}
|
|
8634
8494
|
}
|
|
8635
8495
|
return Array.from(urls);
|