@tangle-network/agent-eval 0.71.0 → 0.72.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,6 +1,13 @@
1
1
  import {
2
- agentProfileHash
3
- } from "./chunk-PQV2TKC3.js";
2
+ MODEL_PRICING,
3
+ MetricsCollector,
4
+ TokenCounter,
5
+ agentProfileHash,
6
+ estimateCost,
7
+ estimateTokens,
8
+ isModelPriced,
9
+ resolveModelPricing
10
+ } from "./chunk-SL55X4VN.js";
4
11
  import {
5
12
  HoldoutAuditor,
6
13
  canaryLeakView,
@@ -31,12 +38,12 @@ import {
31
38
  scoreRedTeamOutput,
32
39
  surfaceContentHash,
33
40
  toolNamesForRun
34
- } from "./chunk-VMAYE3LM.js";
41
+ } from "./chunk-4QJN7RDX.js";
35
42
  import {
36
43
  BackendIntegrityError,
37
44
  assertRealBackend,
38
45
  summarizeBackendIntegrity
39
- } from "./chunk-6XQIEUQ2.js";
46
+ } from "./chunk-ZPSKPT3V.js";
40
47
  import {
41
48
  BENCHMARK_SPLIT_SEED,
42
49
  benchmarks_exports,
@@ -3093,158 +3100,6 @@ var ConvergenceTracker = class {
3093
3100
  }
3094
3101
  };
3095
3102
 
3096
- // src/metrics.ts
3097
- var MODEL_PRICING = {
3098
- "gpt-4o": { input: 25e-4, output: 0.01 },
3099
- "gpt-4o-mini": { input: 15e-5, output: 6e-4 },
3100
- "gpt-4-turbo": { input: 0.01, output: 0.03 },
3101
- "claude-sonnet-4-20250514": { input: 3e-3, output: 0.015 },
3102
- "claude-opus-4-20250514": { input: 0.015, output: 0.075 },
3103
- "claude-3-haiku-20240307": { input: 25e-5, output: 125e-5 }
3104
- };
3105
- var FAMILY_PRICING = [
3106
- [/claude.*opus/, { input: 0.015, output: 0.075 }],
3107
- [/claude.*haiku/, { input: 8e-4, output: 4e-3 }],
3108
- [/claude.*sonnet|claude-code|claude-sonnet/, { input: 3e-3, output: 0.015 }],
3109
- [/gpt-4o-mini/, { input: 15e-5, output: 6e-4 }],
3110
- [/gpt-5|gpt-4\.1|o[134]\b/, { input: 125e-5, output: 0.01 }],
3111
- [/gpt-4o|gpt-4/, { input: 25e-4, output: 0.01 }],
3112
- [/deepseek/, { input: 3e-4, output: 11e-4 }],
3113
- [/glm|zhipu|zai/, { input: 6e-4, output: 22e-4 }],
3114
- [/kimi|moonshot/, { input: 6e-4, output: 25e-4 }],
3115
- [/qwen/, { input: 4e-4, output: 12e-4 }],
3116
- [/gemini.*flash/, { input: 1e-4, output: 4e-4 }],
3117
- [/gemini/, { input: 125e-5, output: 5e-3 }],
3118
- [/llama/, { input: 2e-4, output: 6e-4 }]
3119
- ];
3120
- function normalizeModelId(model) {
3121
- return (model.split("@")[0] ?? model).trim().toLowerCase();
3122
- }
3123
- function resolveModelPricing(model) {
3124
- if (MODEL_PRICING[model]) return MODEL_PRICING[model];
3125
- const id = normalizeModelId(model);
3126
- if (MODEL_PRICING[id]) return MODEL_PRICING[id];
3127
- for (const [pattern, price] of FAMILY_PRICING) {
3128
- if (pattern.test(id)) return price;
3129
- }
3130
- return null;
3131
- }
3132
- function isModelPriced(model) {
3133
- return resolveModelPricing(model) !== null;
3134
- }
3135
- var warnedUnpricedModels = /* @__PURE__ */ new Set();
3136
- function estimateTokens(text) {
3137
- return Math.ceil(text.length / 4);
3138
- }
3139
- function estimateCost(inputTokens, outputTokens, model) {
3140
- const pricing = resolveModelPricing(model);
3141
- if (!pricing) {
3142
- if (!warnedUnpricedModels.has(model)) {
3143
- warnedUnpricedModels.add(model);
3144
- console.warn(
3145
- `estimateCost: no pricing for model "${model}" \u2014 returning 0; add it to MODEL_PRICING/FAMILY_PRICING (cost/Pareto axes will be blank until then)`
3146
- );
3147
- }
3148
- return 0;
3149
- }
3150
- return inputTokens / 1e3 * pricing.input + outputTokens / 1e3 * pricing.output;
3151
- }
3152
- var TokenCounter = class {
3153
- totalInput = 0;
3154
- totalOutput = 0;
3155
- totalCost = 0;
3156
- model;
3157
- constructor(model = "gpt-4o") {
3158
- this.model = model;
3159
- }
3160
- /** Record tokens for a turn, returns per-turn cost */
3161
- record(inputTokens, outputTokens) {
3162
- this.totalInput += inputTokens;
3163
- this.totalOutput += outputTokens;
3164
- const cost = estimateCost(inputTokens, outputTokens, this.model);
3165
- this.totalCost += cost;
3166
- return cost;
3167
- }
3168
- /** Estimate and record from raw text */
3169
- recordFromText(inputText, outputText) {
3170
- const inputTokens = estimateTokens(inputText);
3171
- const outputTokens = estimateTokens(outputText);
3172
- const cost = this.record(inputTokens, outputTokens);
3173
- return { inputTokens, outputTokens, cost };
3174
- }
3175
- getTotalInput() {
3176
- return this.totalInput;
3177
- }
3178
- getTotalOutput() {
3179
- return this.totalOutput;
3180
- }
3181
- getTotalCost() {
3182
- return this.totalCost;
3183
- }
3184
- };
3185
- var MetricsCollector = class {
3186
- client;
3187
- workspaceId;
3188
- metrics = [];
3189
- constructor(client, workspaceId) {
3190
- this.client = client;
3191
- this.workspaceId = workspaceId;
3192
- }
3193
- /** Collect metrics after a turn completes */
3194
- async collect(turn, responseLatencyMs, responseChars, codeBlocksProduced, blocksExtracted, completionCriteriaMet, completionCriteriaTotal, qualityScore, inputTokens = 0, outputTokens = 0, estimatedCostUsd = 0) {
3195
- const state = await this.getState();
3196
- const m = {
3197
- turn,
3198
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3199
- tasks: state.tasks,
3200
- events: state.events,
3201
- proposals: state.proposals,
3202
- vaultFiles: state.vaultFiles.length,
3203
- responseLatencyMs,
3204
- responseChars,
3205
- codeBlocksProduced,
3206
- blocksExtracted,
3207
- qualityScore,
3208
- inputTokens,
3209
- outputTokens,
3210
- estimatedCostUsd,
3211
- totalCostUsd: estimatedCostUsd,
3212
- completionPercent: completionCriteriaTotal > 0 ? completionCriteriaMet / completionCriteriaTotal * 100 : 0
3213
- };
3214
- this.metrics.push(m);
3215
- return m;
3216
- }
3217
- /** Get current product state */
3218
- async getState() {
3219
- const [tasks, events, approvals, vaultFiles] = await Promise.all([
3220
- this.client.getTasks(this.workspaceId),
3221
- this.client.getEvents(this.workspaceId),
3222
- this.client.getApprovals(this.workspaceId),
3223
- this.client.getVaultTree(this.workspaceId)
3224
- ]);
3225
- return {
3226
- tasks: tasks.length,
3227
- events: events.length,
3228
- proposals: {
3229
- pending: approvals.filter((a) => a.status === "pending").length,
3230
- approved: approvals.filter((a) => a.status === "approved").length,
3231
- rejected: approvals.filter((a) => a.status === "rejected").length
3232
- },
3233
- vaultFiles,
3234
- codeBlocks: 0,
3235
- generations: 0
3236
- };
3237
- }
3238
- /** Get all collected metrics */
3239
- getMetrics() {
3240
- return [...this.metrics];
3241
- }
3242
- /** Get convergence curve (completion% over turns) */
3243
- getConvergenceCurve() {
3244
- return this.metrics.map((m) => m.completionPercent);
3245
- }
3246
- };
3247
-
3248
3103
  // src/driver.ts
3249
3104
  var RIGOR_STANCE = {
3250
3105
  cooperative: "Your stance: a pragmatic early adopter. You accept reasonable answers and only push back on clear gaps or outright errors.",