@tangle-network/agent-eval 0.71.0 → 0.72.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +24 -0
- package/dist/campaign/index.js +25 -12
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-VMAYE3LM.js → chunk-4QJN7RDX.js} +3 -3
- package/dist/chunk-SL55X4VN.js +186 -0
- package/dist/chunk-SL55X4VN.js.map +1 -0
- package/dist/{chunk-6QZUCFKM.js → chunk-UD6EF73X.js} +3 -3
- package/dist/{chunk-6XQIEUQ2.js → chunk-ZPSKPT3V.js} +5 -3
- package/dist/{chunk-6XQIEUQ2.js.map → chunk-ZPSKPT3V.js.map} +1 -1
- package/dist/contract/index.js +3 -3
- package/dist/index.js +11 -156
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/{run-campaign-BVY3RGAZ.js → run-campaign-OVEZF24D.js} +2 -2
- package/package.json +1 -1
- package/dist/chunk-PQV2TKC3.js +0 -27
- package/dist/chunk-PQV2TKC3.js.map +0 -1
- /package/dist/{chunk-VMAYE3LM.js.map → chunk-4QJN7RDX.js.map} +0 -0
- /package/dist/{chunk-6QZUCFKM.js.map → chunk-UD6EF73X.js.map} +0 -0
- /package/dist/{run-campaign-BVY3RGAZ.js.map → run-campaign-OVEZF24D.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -1,6 +1,13 @@
|
|
|
1
1
|
import {
|
|
2
|
-
|
|
3
|
-
|
|
2
|
+
MODEL_PRICING,
|
|
3
|
+
MetricsCollector,
|
|
4
|
+
TokenCounter,
|
|
5
|
+
agentProfileHash,
|
|
6
|
+
estimateCost,
|
|
7
|
+
estimateTokens,
|
|
8
|
+
isModelPriced,
|
|
9
|
+
resolveModelPricing
|
|
10
|
+
} from "./chunk-SL55X4VN.js";
|
|
4
11
|
import {
|
|
5
12
|
HoldoutAuditor,
|
|
6
13
|
canaryLeakView,
|
|
@@ -31,12 +38,12 @@ import {
|
|
|
31
38
|
scoreRedTeamOutput,
|
|
32
39
|
surfaceContentHash,
|
|
33
40
|
toolNamesForRun
|
|
34
|
-
} from "./chunk-VMAYE3LM.js";
|
|
41
|
+
} from "./chunk-4QJN7RDX.js";
|
|
35
42
|
import {
|
|
36
43
|
BackendIntegrityError,
|
|
37
44
|
assertRealBackend,
|
|
38
45
|
summarizeBackendIntegrity
|
|
39
|
-
} from "./chunk-6XQIEUQ2.js";
|
|
46
|
+
} from "./chunk-ZPSKPT3V.js";
|
|
40
47
|
import {
|
|
41
48
|
BENCHMARK_SPLIT_SEED,
|
|
42
49
|
benchmarks_exports,
|
|
@@ -3093,158 +3100,6 @@ var ConvergenceTracker = class {
|
|
|
3093
3100
|
}
|
|
3094
3101
|
};
|
|
3095
3102
|
|
|
3096
|
-
// src/metrics.ts
|
|
3097
|
-
var MODEL_PRICING = {
|
|
3098
|
-
"gpt-4o": { input: 25e-4, output: 0.01 },
|
|
3099
|
-
"gpt-4o-mini": { input: 15e-5, output: 6e-4 },
|
|
3100
|
-
"gpt-4-turbo": { input: 0.01, output: 0.03 },
|
|
3101
|
-
"claude-sonnet-4-20250514": { input: 3e-3, output: 0.015 },
|
|
3102
|
-
"claude-opus-4-20250514": { input: 0.015, output: 0.075 },
|
|
3103
|
-
"claude-3-haiku-20240307": { input: 25e-5, output: 125e-5 }
|
|
3104
|
-
};
|
|
3105
|
-
var FAMILY_PRICING = [
|
|
3106
|
-
[/claude.*opus/, { input: 0.015, output: 0.075 }],
|
|
3107
|
-
[/claude.*haiku/, { input: 8e-4, output: 4e-3 }],
|
|
3108
|
-
[/claude.*sonnet|claude-code|claude-sonnet/, { input: 3e-3, output: 0.015 }],
|
|
3109
|
-
[/gpt-4o-mini/, { input: 15e-5, output: 6e-4 }],
|
|
3110
|
-
[/gpt-5|gpt-4\.1|o[134]\b/, { input: 125e-5, output: 0.01 }],
|
|
3111
|
-
[/gpt-4o|gpt-4/, { input: 25e-4, output: 0.01 }],
|
|
3112
|
-
[/deepseek/, { input: 3e-4, output: 11e-4 }],
|
|
3113
|
-
[/glm|zhipu|zai/, { input: 6e-4, output: 22e-4 }],
|
|
3114
|
-
[/kimi|moonshot/, { input: 6e-4, output: 25e-4 }],
|
|
3115
|
-
[/qwen/, { input: 4e-4, output: 12e-4 }],
|
|
3116
|
-
[/gemini.*flash/, { input: 1e-4, output: 4e-4 }],
|
|
3117
|
-
[/gemini/, { input: 125e-5, output: 5e-3 }],
|
|
3118
|
-
[/llama/, { input: 2e-4, output: 6e-4 }]
|
|
3119
|
-
];
|
|
3120
|
-
function normalizeModelId(model) {
|
|
3121
|
-
return (model.split("@")[0] ?? model).trim().toLowerCase();
|
|
3122
|
-
}
|
|
3123
|
-
function resolveModelPricing(model) {
|
|
3124
|
-
if (MODEL_PRICING[model]) return MODEL_PRICING[model];
|
|
3125
|
-
const id = normalizeModelId(model);
|
|
3126
|
-
if (MODEL_PRICING[id]) return MODEL_PRICING[id];
|
|
3127
|
-
for (const [pattern, price] of FAMILY_PRICING) {
|
|
3128
|
-
if (pattern.test(id)) return price;
|
|
3129
|
-
}
|
|
3130
|
-
return null;
|
|
3131
|
-
}
|
|
3132
|
-
function isModelPriced(model) {
|
|
3133
|
-
return resolveModelPricing(model) !== null;
|
|
3134
|
-
}
|
|
3135
|
-
var warnedUnpricedModels = /* @__PURE__ */ new Set();
|
|
3136
|
-
function estimateTokens(text) {
|
|
3137
|
-
return Math.ceil(text.length / 4);
|
|
3138
|
-
}
|
|
3139
|
-
function estimateCost(inputTokens, outputTokens, model) {
|
|
3140
|
-
const pricing = resolveModelPricing(model);
|
|
3141
|
-
if (!pricing) {
|
|
3142
|
-
if (!warnedUnpricedModels.has(model)) {
|
|
3143
|
-
warnedUnpricedModels.add(model);
|
|
3144
|
-
console.warn(
|
|
3145
|
-
`estimateCost: no pricing for model "${model}" \u2014 returning 0; add it to MODEL_PRICING/FAMILY_PRICING (cost/Pareto axes will be blank until then)`
|
|
3146
|
-
);
|
|
3147
|
-
}
|
|
3148
|
-
return 0;
|
|
3149
|
-
}
|
|
3150
|
-
return inputTokens / 1e3 * pricing.input + outputTokens / 1e3 * pricing.output;
|
|
3151
|
-
}
|
|
3152
|
-
var TokenCounter = class {
|
|
3153
|
-
totalInput = 0;
|
|
3154
|
-
totalOutput = 0;
|
|
3155
|
-
totalCost = 0;
|
|
3156
|
-
model;
|
|
3157
|
-
constructor(model = "gpt-4o") {
|
|
3158
|
-
this.model = model;
|
|
3159
|
-
}
|
|
3160
|
-
/** Record tokens for a turn, returns per-turn cost */
|
|
3161
|
-
record(inputTokens, outputTokens) {
|
|
3162
|
-
this.totalInput += inputTokens;
|
|
3163
|
-
this.totalOutput += outputTokens;
|
|
3164
|
-
const cost = estimateCost(inputTokens, outputTokens, this.model);
|
|
3165
|
-
this.totalCost += cost;
|
|
3166
|
-
return cost;
|
|
3167
|
-
}
|
|
3168
|
-
/** Estimate and record from raw text */
|
|
3169
|
-
recordFromText(inputText, outputText) {
|
|
3170
|
-
const inputTokens = estimateTokens(inputText);
|
|
3171
|
-
const outputTokens = estimateTokens(outputText);
|
|
3172
|
-
const cost = this.record(inputTokens, outputTokens);
|
|
3173
|
-
return { inputTokens, outputTokens, cost };
|
|
3174
|
-
}
|
|
3175
|
-
getTotalInput() {
|
|
3176
|
-
return this.totalInput;
|
|
3177
|
-
}
|
|
3178
|
-
getTotalOutput() {
|
|
3179
|
-
return this.totalOutput;
|
|
3180
|
-
}
|
|
3181
|
-
getTotalCost() {
|
|
3182
|
-
return this.totalCost;
|
|
3183
|
-
}
|
|
3184
|
-
};
|
|
3185
|
-
var MetricsCollector = class {
|
|
3186
|
-
client;
|
|
3187
|
-
workspaceId;
|
|
3188
|
-
metrics = [];
|
|
3189
|
-
constructor(client, workspaceId) {
|
|
3190
|
-
this.client = client;
|
|
3191
|
-
this.workspaceId = workspaceId;
|
|
3192
|
-
}
|
|
3193
|
-
/** Collect metrics after a turn completes */
|
|
3194
|
-
async collect(turn, responseLatencyMs, responseChars, codeBlocksProduced, blocksExtracted, completionCriteriaMet, completionCriteriaTotal, qualityScore, inputTokens = 0, outputTokens = 0, estimatedCostUsd = 0) {
|
|
3195
|
-
const state = await this.getState();
|
|
3196
|
-
const m = {
|
|
3197
|
-
turn,
|
|
3198
|
-
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3199
|
-
tasks: state.tasks,
|
|
3200
|
-
events: state.events,
|
|
3201
|
-
proposals: state.proposals,
|
|
3202
|
-
vaultFiles: state.vaultFiles.length,
|
|
3203
|
-
responseLatencyMs,
|
|
3204
|
-
responseChars,
|
|
3205
|
-
codeBlocksProduced,
|
|
3206
|
-
blocksExtracted,
|
|
3207
|
-
qualityScore,
|
|
3208
|
-
inputTokens,
|
|
3209
|
-
outputTokens,
|
|
3210
|
-
estimatedCostUsd,
|
|
3211
|
-
totalCostUsd: estimatedCostUsd,
|
|
3212
|
-
completionPercent: completionCriteriaTotal > 0 ? completionCriteriaMet / completionCriteriaTotal * 100 : 0
|
|
3213
|
-
};
|
|
3214
|
-
this.metrics.push(m);
|
|
3215
|
-
return m;
|
|
3216
|
-
}
|
|
3217
|
-
/** Get current product state */
|
|
3218
|
-
async getState() {
|
|
3219
|
-
const [tasks, events, approvals, vaultFiles] = await Promise.all([
|
|
3220
|
-
this.client.getTasks(this.workspaceId),
|
|
3221
|
-
this.client.getEvents(this.workspaceId),
|
|
3222
|
-
this.client.getApprovals(this.workspaceId),
|
|
3223
|
-
this.client.getVaultTree(this.workspaceId)
|
|
3224
|
-
]);
|
|
3225
|
-
return {
|
|
3226
|
-
tasks: tasks.length,
|
|
3227
|
-
events: events.length,
|
|
3228
|
-
proposals: {
|
|
3229
|
-
pending: approvals.filter((a) => a.status === "pending").length,
|
|
3230
|
-
approved: approvals.filter((a) => a.status === "approved").length,
|
|
3231
|
-
rejected: approvals.filter((a) => a.status === "rejected").length
|
|
3232
|
-
},
|
|
3233
|
-
vaultFiles,
|
|
3234
|
-
codeBlocks: 0,
|
|
3235
|
-
generations: 0
|
|
3236
|
-
};
|
|
3237
|
-
}
|
|
3238
|
-
/** Get all collected metrics */
|
|
3239
|
-
getMetrics() {
|
|
3240
|
-
return [...this.metrics];
|
|
3241
|
-
}
|
|
3242
|
-
/** Get convergence curve (completion% over turns) */
|
|
3243
|
-
getConvergenceCurve() {
|
|
3244
|
-
return this.metrics.map((m) => m.completionPercent);
|
|
3245
|
-
}
|
|
3246
|
-
};
|
|
3247
|
-
|
|
3248
3103
|
// src/driver.ts
|
|
3249
3104
|
var RIGOR_STANCE = {
|
|
3250
3105
|
cooperative: "Your stance: a pragmatic early adopter. You accept reasonable answers and only push back on clear gaps or outright errors.",
|