@ls-stack/agent-eval 0.58.5 → 0.59.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{app-DLNmRUqH.mjs → app-B3PEtWqH.mjs} +4 -4
- package/dist/apps/web/dist/assets/index-BIEuCK_8.js +377 -0
- package/dist/apps/web/dist/assets/index-CWoKLKTt.css +1 -0
- package/dist/apps/web/dist/index.html +2 -2
- package/dist/bin.mjs +1 -1
- package/dist/caseChild.mjs +1 -1
- package/dist/{cli-ClAkjTvo.mjs → cli-Dkp2-rBm.mjs} +4 -4
- package/dist/index.d.mts +100 -64
- package/dist/index.mjs +3 -3
- package/dist/runChild.mjs +2 -2
- package/dist/{runExecution-BMnJXWhN.mjs → runExecution-C3XVZHRC.mjs} +94 -6
- package/dist/{runOrchestration-CvbTAoEb.mjs → runOrchestration-B5An-AEi.mjs} +1 -1
- package/dist/{runner-DJJekv9f.mjs → runner-BJXz_V_V.mjs} +1 -1
- package/dist/{runner-BfHgVhGS.mjs → runner-C9J-1fkp.mjs} +2 -2
- package/dist/{src-DfzidkYr.mjs → src-8dGXUULC.mjs} +2 -2
- package/package.json +3 -3
- package/skills/agent-eval/SKILL.md +10 -2
- package/dist/apps/web/dist/assets/index-BD6FXk5p.js +0 -377
- package/dist/apps/web/dist/assets/index-C2fbGEsB.css +0 -1
|
@@ -5142,8 +5142,88 @@ function buildTraceTree(spans, checkpoints) {
|
|
|
5142
5142
|
visit(null);
|
|
5143
5143
|
return result;
|
|
5144
5144
|
};
|
|
5145
|
+
const isRecord = (value) => {
|
|
5146
|
+
return typeof value === "object" && value !== null;
|
|
5147
|
+
};
|
|
5148
|
+
const readRecordValue = (value, key) => {
|
|
5149
|
+
if (!isRecord(value)) return void 0;
|
|
5150
|
+
const child = value[key];
|
|
5151
|
+
return isRecord(child) ? child : void 0;
|
|
5152
|
+
};
|
|
5153
|
+
const readStringValue = (value, key) => {
|
|
5154
|
+
if (!isRecord(value)) return void 0;
|
|
5155
|
+
const child = value[key];
|
|
5156
|
+
return typeof child === "string" && child.length > 0 ? child : void 0;
|
|
5157
|
+
};
|
|
5158
|
+
const readValue = (value, key) => {
|
|
5159
|
+
if (!isRecord(value)) return void 0;
|
|
5160
|
+
return value[key];
|
|
5161
|
+
};
|
|
5162
|
+
const parseMaybeJson = (value) => {
|
|
5163
|
+
if (typeof value !== "string") return value;
|
|
5164
|
+
const parsed = resultify(() => JSON.parse(value));
|
|
5165
|
+
return parsed.error ? value : parsed.value;
|
|
5166
|
+
};
|
|
5167
|
+
const firstDefined = (values) => {
|
|
5168
|
+
return values.find((value) => value !== void 0);
|
|
5169
|
+
};
|
|
5170
|
+
const getToolCallMetadata = (span) => {
|
|
5171
|
+
const attributes = span.attributes;
|
|
5172
|
+
return {
|
|
5173
|
+
attributes,
|
|
5174
|
+
genAI: readRecordValue(attributes, "genAI"),
|
|
5175
|
+
mastra: readRecordValue(attributes, "mastra"),
|
|
5176
|
+
toolAttributes: readRecordValue(attributes, "attributes")
|
|
5177
|
+
};
|
|
5178
|
+
};
|
|
5145
5179
|
const isToolCallSpan = (span) => {
|
|
5146
|
-
|
|
5180
|
+
const { attributes, genAI, mastra } = getToolCallMetadata(span);
|
|
5181
|
+
return span.kind === "tool" || span.kind === "tool_call" || readStringValue(attributes, "gen_ai.tool.type") === "tool" || readStringValue(genAI, "gen_ai.tool.type") === "tool" || readStringValue(genAI, "mastra.span.type") === "tool_call" || readStringValue(mastra, "type") === "tool_call" || readStringValue(mastra, "entityType") === "tool";
|
|
5182
|
+
};
|
|
5183
|
+
const getToolCallIdentityNames = (span) => {
|
|
5184
|
+
const { attributes, genAI, mastra } = getToolCallMetadata(span);
|
|
5185
|
+
return [
|
|
5186
|
+
readStringValue(attributes, "gen_ai.tool.name"),
|
|
5187
|
+
readStringValue(genAI, "gen_ai.tool.name"),
|
|
5188
|
+
readStringValue(mastra, "entityName"),
|
|
5189
|
+
readStringValue(mastra, "entityId"),
|
|
5190
|
+
span.name
|
|
5191
|
+
].filter((name) => name !== void 0);
|
|
5192
|
+
};
|
|
5193
|
+
const getPreferredToolCallName = (span) => {
|
|
5194
|
+
return getToolCallIdentityNames(span)[0] ?? span.name;
|
|
5195
|
+
};
|
|
5196
|
+
const toolCallSpanMatchesName = (span, toolName) => {
|
|
5197
|
+
return getToolCallIdentityNames(span).includes(toolName);
|
|
5198
|
+
};
|
|
5199
|
+
const countToolCallSpans = (toolName) => {
|
|
5200
|
+
return spans.filter((span) => {
|
|
5201
|
+
return isToolCallSpan(span) && toolCallSpanMatchesName(span, toolName);
|
|
5202
|
+
}).length;
|
|
5203
|
+
};
|
|
5204
|
+
const buildToolCallSpan = (span) => {
|
|
5205
|
+
const { attributes, genAI, toolAttributes } = getToolCallMetadata(span);
|
|
5206
|
+
return {
|
|
5207
|
+
name: getPreferredToolCallName(span),
|
|
5208
|
+
spanName: span.name,
|
|
5209
|
+
kind: span.kind,
|
|
5210
|
+
arguments: parseMaybeJson(firstDefined([
|
|
5211
|
+
readValue(attributes, "gen_ai.tool.call.arguments"),
|
|
5212
|
+
readValue(genAI, "gen_ai.tool.call.arguments"),
|
|
5213
|
+
readValue(attributes, "arguments"),
|
|
5214
|
+
readValue(attributes, "input")
|
|
5215
|
+
])),
|
|
5216
|
+
result: parseMaybeJson(firstDefined([
|
|
5217
|
+
readValue(attributes, "gen_ai.tool.call.result"),
|
|
5218
|
+
readValue(genAI, "gen_ai.tool.call.result"),
|
|
5219
|
+
readValue(attributes, "result"),
|
|
5220
|
+
readValue(attributes, "output")
|
|
5221
|
+
])),
|
|
5222
|
+
description: readStringValue(attributes, "gen_ai.tool.description") ?? readStringValue(genAI, "gen_ai.tool.description") ?? readStringValue(toolAttributes, "toolDescription"),
|
|
5223
|
+
toolType: readStringValue(attributes, "gen_ai.tool.type") ?? readStringValue(genAI, "gen_ai.tool.type") ?? readStringValue(toolAttributes, "toolType"),
|
|
5224
|
+
attributes,
|
|
5225
|
+
span
|
|
5226
|
+
};
|
|
5147
5227
|
};
|
|
5148
5228
|
const filterSpanNames = (sourceSpans, kind) => {
|
|
5149
5229
|
return sourceSpans.filter((span) => kind === void 0 || span.kind === kind).map((span) => span.name);
|
|
@@ -5167,15 +5247,23 @@ function buildTraceTree(spans, checkpoints) {
|
|
|
5167
5247
|
return spans.filter(isToolCallSpan);
|
|
5168
5248
|
},
|
|
5169
5249
|
listToolCallSpanNames() {
|
|
5170
|
-
return spans.filter(isToolCallSpan).map(
|
|
5250
|
+
return spans.filter(isToolCallSpan).map(getPreferredToolCallName);
|
|
5171
5251
|
},
|
|
5172
5252
|
hasToolCallSpan(name) {
|
|
5173
|
-
return spans.some((s) =>
|
|
5253
|
+
return spans.some((s) => {
|
|
5254
|
+
return isToolCallSpan(s) && toolCallSpanMatchesName(s, name);
|
|
5255
|
+
});
|
|
5174
5256
|
},
|
|
5175
|
-
|
|
5257
|
+
getToolCallSpans(name) {
|
|
5176
5258
|
return spans.filter((span) => {
|
|
5177
|
-
return isToolCallSpan(span) && span
|
|
5178
|
-
}).
|
|
5259
|
+
return isToolCallSpan(span) && toolCallSpanMatchesName(span, name);
|
|
5260
|
+
}).map(buildToolCallSpan);
|
|
5261
|
+
},
|
|
5262
|
+
getToolCallSpanCount(toolName) {
|
|
5263
|
+
return countToolCallSpans(toolName);
|
|
5264
|
+
},
|
|
5265
|
+
hasToolCallSpanCount(toolName, expectedCalls) {
|
|
5266
|
+
return countToolCallSpans(toolName) === expectedCalls;
|
|
5179
5267
|
},
|
|
5180
5268
|
listSpanNames(kind) {
|
|
5181
5269
|
return filterSpanNames(spans, kind);
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { Dt as getCaseRowCaseKey, Lt as runWithEvalRegistry, Ot as caseDetailSchema, Q as runWithEvalClock, St as runSummarySchema, Tt as buildCaseKey, Y as runInEvalRuntimeScope, _t as dedupeEvalTags, bt as validateTagsFilterExpression, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as deriveStatusFromChildStatuses, ht as deriveStatusFromCaseRows, i as isCaseChildMessage, kt as caseRowSchema, m as buildDeclaredColumnDefs, mt as deriveScopedSummaryFromCases, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, t as filterEvalCases, u as runWithModuleIsolation, vt as matchesTagsFilter, xt as runManifestSchema, yt as validateEvalTagName } from "./runExecution-BMnJXWhN.mjs";
|
|
1
|
+
import { Dt as getCaseRowCaseKey, Lt as runWithEvalRegistry, Ot as caseDetailSchema, Q as runWithEvalClock, St as runSummarySchema, Tt as buildCaseKey, Y as runInEvalRuntimeScope, _t as dedupeEvalTags, bt as validateTagsFilterExpression, d as loadEvalModule, f as resolveEvalDefaultConfig, g as commitPendingCacheWrites, gt as deriveStatusFromChildStatuses, ht as deriveStatusFromCaseRows, i as isCaseChildMessage, kt as caseRowSchema, m as buildDeclaredColumnDefs, mt as deriveScopedSummaryFromCases, n as resolveRunnableEvalCases, o as stripTerminalControlCodes, t as filterEvalCases, u as runWithModuleIsolation, vt as matchesTagsFilter, xt as runManifestSchema, yt as validateEvalTagName } from "./runExecution-C3XVZHRC.mjs";
|
|
2
2
|
import { Result, resultify } from "t-result";
|
|
3
3
|
import { readFile, readdir, rm, writeFile } from "node:fs/promises";
|
|
4
4
|
import { dirname, join } from "node:path";
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { n as initRunner, t as getRunnerInstance } from "./runner-BfHgVhGS.mjs";
|
|
1
|
+
import { n as initRunner, t as getRunnerInstance } from "./runner-C9J-1fkp.mjs";
|
|
2
2
|
export { getRunnerInstance, initRunner };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { n as createRunner } from "./cli-ClAkjTvo.mjs";
|
|
2
|
-
import "./src-DfzidkYr.mjs";
|
|
1
|
+
import { n as createRunner } from "./cli-Dkp2-rBm.mjs";
|
|
2
|
+
import "./src-8dGXUULC.mjs";
|
|
3
3
|
//#region ../../apps/server/src/runner.ts
|
|
4
4
|
let runnerInstance = null;
|
|
5
5
|
function getRunnerInstance() {
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Ft as defineEval$1, K as matchesEvalTags$1 } from "./runExecution-BMnJXWhN.mjs";
|
|
2
|
-
import "./cli-ClAkjTvo.mjs";
|
|
1
|
+
import { Ft as defineEval$1, K as matchesEvalTags$1 } from "./runExecution-C3XVZHRC.mjs";
|
|
2
|
+
import "./cli-Dkp2-rBm.mjs";
|
|
3
3
|
//#region src/index.ts
|
|
4
4
|
/** Register an eval definition with typed tag support. */
|
|
5
5
|
function defineEval(definition) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ls-stack/agent-eval",
|
|
3
|
-
"version": "0.58.5",
|
|
3
|
+
"version": "0.59.1",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"bin": {
|
|
6
6
|
"agent-evals": "./dist/bin.mjs"
|
|
@@ -32,9 +32,9 @@
|
|
|
32
32
|
"devDependencies": {
|
|
33
33
|
"@types/node": "^24.7.2",
|
|
34
34
|
"typescript": "^5.9.2",
|
|
35
|
-
"@agent-evals/sdk": "0.0.1",
|
|
36
35
|
"@agent-evals/runner": "0.0.1",
|
|
37
|
-
"@agent-evals/shared": "0.0.1"
|
|
36
|
+
"@agent-evals/shared": "0.0.1",
|
|
37
|
+
"@agent-evals/sdk": "0.0.1"
|
|
38
38
|
},
|
|
39
39
|
"scripts": {
|
|
40
40
|
"build": "pnpm --filter @agent-evals/web build && pnpm --filter @agent-evals/shared build && pnpm --filter @agent-evals/sdk build && pnpm --filter @agent-evals/runner build && tsdown --filter cli-js && tsdown --filter cli-types",
|
|
@@ -375,11 +375,19 @@ See `EvalScoreDef` / `EvalManualScoreDef` in the types for the full shape
|
|
|
375
375
|
`trace.hasSpan(name)`, `trace.findSpansByKind(kind)`,
|
|
376
376
|
`trace.findToolCallSpans()`, `trace.listToolCallSpanNames()`,
|
|
377
377
|
`trace.hasToolCallSpan(name)`,
|
|
378
|
-
`trace.
|
|
378
|
+
`trace.getToolCallSpans(name)`,
|
|
379
|
+
`trace.getToolCallSpanCount(toolName)`,
|
|
380
|
+
`trace.hasToolCallSpanCount(toolName, expectedCalls)`,
|
|
379
381
|
`trace.listSpanNames(kind?)`, `trace.listSpanNamesDfs(kind?)`, and
|
|
380
382
|
`trace.flattenDfs()`.
|
|
381
383
|
The tool-call helpers include both `kind: 'tool'` spans and imported
|
|
382
|
-
execution spans recorded as `kind: 'tool_call'`.
|
|
384
|
+
execution spans recorded as `kind: 'tool_call'`. Tool-name checks and counts
|
|
385
|
+
match the span `name` as well as GenAI/Mastra identity attributes such as
|
|
386
|
+
`genAI["gen_ai.tool.name"]` and `mastra.entityName`; list helpers prefer
|
|
387
|
+
those tool identity attributes when present. `getToolCallSpans(name)`
|
|
388
|
+
returns one normalized object per matching call, including parsed
|
|
389
|
+
`arguments`, parsed `result`, `description`, `toolType`, `attributes`, and
|
|
390
|
+
the original `span`.
|
|
383
391
|
- `traceDisplay` promotes selected span attributes into the trace tree and
|
|
384
392
|
detail pane; it supports aggregation across subtrees (`scope`, `mode`) and
|
|
385
393
|
user-defined `transform(...)` for derived views (e.g. currency conversion).
|