@tangle-network/agent-eval 0.70.0 → 0.72.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/CHANGELOG.md +24 -0
  2. package/dist/adapters/http.js +1 -1
  3. package/dist/adapters/http.js.map +1 -1
  4. package/dist/campaign/index.d.ts +10 -0
  5. package/dist/campaign/index.js +48 -11
  6. package/dist/campaign/index.js.map +1 -1
  7. package/dist/{chunk-ZZCQQHW7.js → chunk-4QJN7RDX.js} +4 -4
  8. package/dist/chunk-4QJN7RDX.js.map +1 -0
  9. package/dist/{chunk-3B7Y5AUR.js → chunk-GWGO2K6Y.js} +3 -2
  10. package/dist/chunk-GWGO2K6Y.js.map +1 -0
  11. package/dist/{chunk-Z4ZCBC7M.js → chunk-ODGETRTM.js} +4 -3
  12. package/dist/chunk-ODGETRTM.js.map +1 -0
  13. package/dist/chunk-SL55X4VN.js +186 -0
  14. package/dist/chunk-SL55X4VN.js.map +1 -0
  15. package/dist/{chunk-GYELOWB6.js → chunk-UD6EF73X.js} +3 -3
  16. package/dist/{chunk-6XQIEUQ2.js → chunk-ZPSKPT3V.js} +5 -3
  17. package/dist/{chunk-6XQIEUQ2.js.map → chunk-ZPSKPT3V.js.map} +1 -1
  18. package/dist/contract/index.js +3 -3
  19. package/dist/index.js +31 -171
  20. package/dist/index.js.map +1 -1
  21. package/dist/openapi.json +1 -1
  22. package/dist/pipelines/index.js +1 -1
  23. package/dist/rl.d.ts +155 -1
  24. package/dist/rl.js +195 -6
  25. package/dist/rl.js.map +1 -1
  26. package/dist/{run-campaign-BVY3RGAZ.js → run-campaign-OVEZF24D.js} +2 -2
  27. package/dist/traces.js +1 -1
  28. package/package.json +1 -1
  29. package/dist/chunk-3B7Y5AUR.js.map +0 -1
  30. package/dist/chunk-PQV2TKC3.js +0 -27
  31. package/dist/chunk-PQV2TKC3.js.map +0 -1
  32. package/dist/chunk-Z4ZCBC7M.js.map +0 -1
  33. package/dist/chunk-ZZCQQHW7.js.map +0 -1
  34. package/dist/{chunk-GYELOWB6.js.map → chunk-UD6EF73X.js.map} +0 -0
  35. package/dist/{run-campaign-BVY3RGAZ.js.map → run-campaign-OVEZF24D.js.map} +0 -0
package/dist/index.js CHANGED
@@ -1,6 +1,13 @@
1
1
  import {
2
- agentProfileHash
3
- } from "./chunk-PQV2TKC3.js";
2
+ MODEL_PRICING,
3
+ MetricsCollector,
4
+ TokenCounter,
5
+ agentProfileHash,
6
+ estimateCost,
7
+ estimateTokens,
8
+ isModelPriced,
9
+ resolveModelPricing
10
+ } from "./chunk-SL55X4VN.js";
4
11
  import {
5
12
  HoldoutAuditor,
6
13
  canaryLeakView,
@@ -31,12 +38,12 @@ import {
31
38
  scoreRedTeamOutput,
32
39
  surfaceContentHash,
33
40
  toolNamesForRun
34
- } from "./chunk-ZZCQQHW7.js";
41
+ } from "./chunk-4QJN7RDX.js";
35
42
  import {
36
43
  BackendIntegrityError,
37
44
  assertRealBackend,
38
45
  summarizeBackendIntegrity
39
- } from "./chunk-6XQIEUQ2.js";
46
+ } from "./chunk-ZPSKPT3V.js";
40
47
  import {
41
48
  BENCHMARK_SPLIT_SEED,
42
49
  benchmarks_exports,
@@ -49,7 +56,7 @@ import {
49
56
  computeToolUseMetrics,
50
57
  iqr,
51
58
  welchsTTest
52
- } from "./chunk-3B7Y5AUR.js";
59
+ } from "./chunk-GWGO2K6Y.js";
53
60
  import {
54
61
  exportTrainingData,
55
62
  toNdjson
@@ -204,7 +211,7 @@ import {
204
211
  tokenizeDomainWords,
205
212
  traceAnalystFunctionGroup,
206
213
  traceAnalystOnRunComplete
207
- } from "./chunk-Z4ZCBC7M.js";
214
+ } from "./chunk-ODGETRTM.js";
208
215
  import {
209
216
  DEFAULT_REDACTION_RULES,
210
217
  REDACTION_VERSION,
@@ -2595,14 +2602,15 @@ async function executeScenario(tc, scenario, config) {
2595
2602
  const content = resp.choices?.[0]?.message?.content ?? "";
2596
2603
  messages.push({ role: "assistant", content });
2597
2604
  const codeRe = /```(\w+)?\n([\s\S]*?)```/g;
2598
- let codeMatch;
2599
- while ((codeMatch = codeRe.exec(content)) !== null) {
2605
+ let codeMatch = codeRe.exec(content);
2606
+ while (codeMatch !== null) {
2600
2607
  allCodeBlocks.push({ language: codeMatch[1] ?? "text", code: codeMatch[2] ?? "" });
2608
+ codeMatch = codeRe.exec(content);
2601
2609
  }
2602
2610
  const turnBlocks = [];
2603
- let blockMatch;
2604
2611
  const blockReLocal = new RegExp(blockRe.source, blockRe.flags);
2605
- while ((blockMatch = blockReLocal.exec(content)) !== null) {
2612
+ let blockMatch = blockReLocal.exec(content);
2613
+ while (blockMatch !== null) {
2606
2614
  const fields = {};
2607
2615
  for (const line of (blockMatch[2] ?? "").split("\n")) {
2608
2616
  const idx = line.indexOf(":");
@@ -2611,15 +2619,17 @@ async function executeScenario(tc, scenario, config) {
2611
2619
  const blockType = blockMatch[1] ?? "";
2612
2620
  allBlocks.push({ type: blockType, fields });
2613
2621
  turnBlocks.push({ type: blockType, title: fields.title ?? "" });
2622
+ blockMatch = blockReLocal.exec(content);
2614
2623
  }
2615
2624
  let hasToolCall = false;
2616
2625
  if (config.toolCallPatterns) {
2617
2626
  for (const pattern of config.toolCallPatterns) {
2618
2627
  const re = new RegExp(pattern.source, pattern.flags);
2619
- let toolMatch;
2620
- while ((toolMatch = re.exec(content)) !== null) {
2628
+ let toolMatch = re.exec(content);
2629
+ while (toolMatch !== null) {
2621
2630
  allToolCalls.push(toolMatch[0]);
2622
2631
  hasToolCall = true;
2632
+ toolMatch = re.exec(content);
2623
2633
  }
2624
2634
  }
2625
2635
  }
@@ -2941,14 +2951,15 @@ var ProductClient = class {
2941
2951
  }
2942
2952
  }
2943
2953
  const blockRe = /:::(\w+)\s*\n([\s\S]*?)\n\s*:::/g;
2944
- let match;
2945
- while ((match = blockRe.exec(text)) !== null) {
2954
+ let match = blockRe.exec(text);
2955
+ while (match !== null) {
2946
2956
  const fields = {};
2947
2957
  for (const line of match[2].split("\n")) {
2948
2958
  const idx = line.indexOf(":");
2949
2959
  if (idx > 0) fields[line.slice(0, idx).trim()] = line.slice(idx + 1).trim();
2950
2960
  }
2951
2961
  blocks.push({ type: match[1], title: fields.title ?? "" });
2962
+ match = blockRe.exec(text);
2952
2963
  }
2953
2964
  return { text, blocks };
2954
2965
  }
@@ -3089,158 +3100,6 @@ var ConvergenceTracker = class {
3089
3100
  }
3090
3101
  };
3091
3102
 
3092
- // src/metrics.ts
3093
- var MODEL_PRICING = {
3094
- "gpt-4o": { input: 25e-4, output: 0.01 },
3095
- "gpt-4o-mini": { input: 15e-5, output: 6e-4 },
3096
- "gpt-4-turbo": { input: 0.01, output: 0.03 },
3097
- "claude-sonnet-4-20250514": { input: 3e-3, output: 0.015 },
3098
- "claude-opus-4-20250514": { input: 0.015, output: 0.075 },
3099
- "claude-3-haiku-20240307": { input: 25e-5, output: 125e-5 }
3100
- };
3101
- var FAMILY_PRICING = [
3102
- [/claude.*opus/, { input: 0.015, output: 0.075 }],
3103
- [/claude.*haiku/, { input: 8e-4, output: 4e-3 }],
3104
- [/claude.*sonnet|claude-code|claude-sonnet/, { input: 3e-3, output: 0.015 }],
3105
- [/gpt-4o-mini/, { input: 15e-5, output: 6e-4 }],
3106
- [/gpt-5|gpt-4\.1|o[134]\b/, { input: 125e-5, output: 0.01 }],
3107
- [/gpt-4o|gpt-4/, { input: 25e-4, output: 0.01 }],
3108
- [/deepseek/, { input: 3e-4, output: 11e-4 }],
3109
- [/glm|zhipu|zai/, { input: 6e-4, output: 22e-4 }],
3110
- [/kimi|moonshot/, { input: 6e-4, output: 25e-4 }],
3111
- [/qwen/, { input: 4e-4, output: 12e-4 }],
3112
- [/gemini.*flash/, { input: 1e-4, output: 4e-4 }],
3113
- [/gemini/, { input: 125e-5, output: 5e-3 }],
3114
- [/llama/, { input: 2e-4, output: 6e-4 }]
3115
- ];
3116
- function normalizeModelId(model) {
3117
- return (model.split("@")[0] ?? model).trim().toLowerCase();
3118
- }
3119
- function resolveModelPricing(model) {
3120
- if (MODEL_PRICING[model]) return MODEL_PRICING[model];
3121
- const id = normalizeModelId(model);
3122
- if (MODEL_PRICING[id]) return MODEL_PRICING[id];
3123
- for (const [pattern, price] of FAMILY_PRICING) {
3124
- if (pattern.test(id)) return price;
3125
- }
3126
- return null;
3127
- }
3128
- function isModelPriced(model) {
3129
- return resolveModelPricing(model) !== null;
3130
- }
3131
- var warnedUnpricedModels = /* @__PURE__ */ new Set();
3132
- function estimateTokens(text) {
3133
- return Math.ceil(text.length / 4);
3134
- }
3135
- function estimateCost(inputTokens, outputTokens, model) {
3136
- const pricing = resolveModelPricing(model);
3137
- if (!pricing) {
3138
- if (!warnedUnpricedModels.has(model)) {
3139
- warnedUnpricedModels.add(model);
3140
- console.warn(
3141
- `estimateCost: no pricing for model "${model}" \u2014 returning 0; add it to MODEL_PRICING/FAMILY_PRICING (cost/Pareto axes will be blank until then)`
3142
- );
3143
- }
3144
- return 0;
3145
- }
3146
- return inputTokens / 1e3 * pricing.input + outputTokens / 1e3 * pricing.output;
3147
- }
3148
- var TokenCounter = class {
3149
- totalInput = 0;
3150
- totalOutput = 0;
3151
- totalCost = 0;
3152
- model;
3153
- constructor(model = "gpt-4o") {
3154
- this.model = model;
3155
- }
3156
- /** Record tokens for a turn, returns per-turn cost */
3157
- record(inputTokens, outputTokens) {
3158
- this.totalInput += inputTokens;
3159
- this.totalOutput += outputTokens;
3160
- const cost = estimateCost(inputTokens, outputTokens, this.model);
3161
- this.totalCost += cost;
3162
- return cost;
3163
- }
3164
- /** Estimate and record from raw text */
3165
- recordFromText(inputText, outputText) {
3166
- const inputTokens = estimateTokens(inputText);
3167
- const outputTokens = estimateTokens(outputText);
3168
- const cost = this.record(inputTokens, outputTokens);
3169
- return { inputTokens, outputTokens, cost };
3170
- }
3171
- getTotalInput() {
3172
- return this.totalInput;
3173
- }
3174
- getTotalOutput() {
3175
- return this.totalOutput;
3176
- }
3177
- getTotalCost() {
3178
- return this.totalCost;
3179
- }
3180
- };
3181
- var MetricsCollector = class {
3182
- client;
3183
- workspaceId;
3184
- metrics = [];
3185
- constructor(client, workspaceId) {
3186
- this.client = client;
3187
- this.workspaceId = workspaceId;
3188
- }
3189
- /** Collect metrics after a turn completes */
3190
- async collect(turn, responseLatencyMs, responseChars, codeBlocksProduced, blocksExtracted, completionCriteriaMet, completionCriteriaTotal, qualityScore, inputTokens = 0, outputTokens = 0, estimatedCostUsd = 0) {
3191
- const state = await this.getState();
3192
- const m = {
3193
- turn,
3194
- timestamp: (/* @__PURE__ */ new Date()).toISOString(),
3195
- tasks: state.tasks,
3196
- events: state.events,
3197
- proposals: state.proposals,
3198
- vaultFiles: state.vaultFiles.length,
3199
- responseLatencyMs,
3200
- responseChars,
3201
- codeBlocksProduced,
3202
- blocksExtracted,
3203
- qualityScore,
3204
- inputTokens,
3205
- outputTokens,
3206
- estimatedCostUsd,
3207
- totalCostUsd: estimatedCostUsd,
3208
- completionPercent: completionCriteriaTotal > 0 ? completionCriteriaMet / completionCriteriaTotal * 100 : 0
3209
- };
3210
- this.metrics.push(m);
3211
- return m;
3212
- }
3213
- /** Get current product state */
3214
- async getState() {
3215
- const [tasks, events, approvals, vaultFiles] = await Promise.all([
3216
- this.client.getTasks(this.workspaceId),
3217
- this.client.getEvents(this.workspaceId),
3218
- this.client.getApprovals(this.workspaceId),
3219
- this.client.getVaultTree(this.workspaceId)
3220
- ]);
3221
- return {
3222
- tasks: tasks.length,
3223
- events: events.length,
3224
- proposals: {
3225
- pending: approvals.filter((a) => a.status === "pending").length,
3226
- approved: approvals.filter((a) => a.status === "approved").length,
3227
- rejected: approvals.filter((a) => a.status === "rejected").length
3228
- },
3229
- vaultFiles,
3230
- codeBlocks: 0,
3231
- generations: 0
3232
- };
3233
- }
3234
- /** Get all collected metrics */
3235
- getMetrics() {
3236
- return [...this.metrics];
3237
- }
3238
- /** Get convergence curve (completion% over turns) */
3239
- getConvergenceCurve() {
3240
- return this.metrics.map((m) => m.completionPercent);
3241
- }
3242
- };
3243
-
3244
3103
  // src/driver.ts
3245
3104
  var RIGOR_STANCE = {
3246
3105
  cooperative: "Your stance: a pragmatic early adopter. You accept reasonable answers and only push back on clear gaps or outright errors.",
@@ -4680,8 +4539,8 @@ function analyzeAntiSlop(outputs, config) {
4680
4539
  const lower = output.toLowerCase();
4681
4540
  for (const phrase of config.bannedPhrases) {
4682
4541
  const needle = phrase.toLowerCase();
4683
- let idx = 0;
4684
- while ((idx = lower.indexOf(needle, idx)) !== -1) {
4542
+ let idx = lower.indexOf(needle, 0);
4543
+ while (idx !== -1) {
4685
4544
  counts.banned_phrase += 1;
4686
4545
  if (issues.length < 20) {
4687
4546
  issues.push({
@@ -4690,7 +4549,7 @@ function analyzeAntiSlop(outputs, config) {
4690
4549
  example: snippet(output, idx, phrase.length)
4691
4550
  });
4692
4551
  }
4693
- idx += needle.length;
4552
+ idx = lower.indexOf(needle, idx + needle.length);
4694
4553
  }
4695
4554
  }
4696
4555
  for (const re of config.bannedOpenings) {
@@ -8623,13 +8482,14 @@ function extractAssetUrls(html, baseUrl) {
8623
8482
  const linkRe = /<link\b[^>]*\bhref\s*=\s*["']([^"']+)["'][^>]*>/gi;
8624
8483
  const scriptRe = /<script\b[^>]*\bsrc\s*=\s*["']([^"']+)["'][^>]*>/gi;
8625
8484
  for (const re of [linkRe, scriptRe]) {
8626
- let match;
8627
- while ((match = re.exec(html)) !== null) {
8485
+ let match = re.exec(html);
8486
+ while (match !== null) {
8628
8487
  const raw = match[1];
8629
8488
  try {
8630
8489
  urls.add(new URL(raw, baseUrl).toString());
8631
8490
  } catch {
8632
8491
  }
8492
+ match = re.exec(html);
8633
8493
  }
8634
8494
  }
8635
8495
  return Array.from(urls);