npm - stable-harness - Versions diffs - 0.0.51 → 0.0.53 - Mend

stable-harness 0.0.51 → 0.0.53

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/docs/granite-tool-calling-comparison.zh.md +3 -3
package/docs/guides/evaluation-foundation.md +72 -0
package/docs/guides/index.md +2 -0
package/package.json +1 -1
package/packages/evaluation/dist/src/benchmark.d.ts +51 -0
package/packages/evaluation/dist/src/benchmark.js +1 -0
package/packages/evaluation/dist/src/evaluators.d.ts +68 -0
package/packages/evaluation/dist/src/evaluators.js +1 -0
package/packages/evaluation/dist/src/index.d.ts +6 -0
package/packages/evaluation/dist/src/index.js +1 -1
package/packages/evaluation/dist/src/run-record.d.ts +68 -0
package/packages/evaluation/dist/src/run-record.js +1 -0

package/docs/granite-tool-calling-comparison.zh.md CHANGED Viewed

@@ -185,10 +185,10 @@ Fast matrix results:
 | Ollama | `qwen3.5:9b` | native auto tools | 15/15 | none |
 | Ollama | `qwen3.5:0.8b` | native auto tools | 14/15 | `freshness` day instead of month |
 | Ollama | `granite4.1:3b` | native auto tools | 14/15 | Chinese news query became unnatural text |
-| Ollama | `qwen3:latest` | native auto tools | 14/15 | Workday ticker became `WORK` |
+| Ollama | `qwen3:latest` | native auto tools | 14/15 | ExampleCo ticker became `EXCO` |
 | Ollama | `qwen2.5:7b-instruct` | native auto tools | 14/15 | path with space was collapsed |
-| Ollama | `gemma4:e2b` | native auto tools | 14/15 | missed Chinese Workday stock tool call |
-| Ollama | `gemma4:e4b` | native auto tools | 14/15 | missed Chinese Workday stock tool call |
+| Ollama | `gemma4:e2b` | native auto tools | 14/15 | missed Chinese ExampleCo stock tool call |
+| Ollama | `gemma4:e4b` | native auto tools | 14/15 | missed Chinese ExampleCo stock tool call |
 | Ollama | `lfm2.5-thinking:latest` | native auto tools | 13/15 | namespace typo and HK market error |
 | Ollama | `qwen3:0.6b` | native auto tools | 12/15 | one timeout, weaker exact title/query handling |
 | Ollama | `gpt-oss:latest` | native auto tools | 12/15 | path and freshness enum errors |

package/docs/guides/evaluation-foundation.md ADDED Viewed

@@ -0,0 +1,72 @@
+# Evaluation Foundation
+Stable Harness evaluates DeepAgents workloads by recording facts first, then
+applying benchmark-neutral quality contracts. The foundation has three objects:
+- `StandardRunRecord`: a normalized record for stable-harness and pure
+  DeepAgents runs.
+- `QualityContract`: workspace-declared success criteria for final response,
+  tool calls, trajectory, workflow final state, control states, and approvals.
+- `BenchmarkSuiteReport`: a comparable report across runtime modes such as
+  `pure_deepagents`, `stable_harness_passthrough`,
+  `stable_harness_quality_gates`, and `stable_harness_recovery`.
+This design keeps DeepAgents execution semantics upstream-owned. Stable Harness
+only records, validates, replays, compares, and governs the run.
+## Supported Evaluation Shapes
+- LangSmith-style evals: final response, single-step/tool, and trajectory data
+  can be projected from `createLangSmithEvaluationTarget`.
+- BFCL-style tool evals: `QualityContract.tools.expected` validates tool
+  selection and argument subsets.
+- Tau-bench-style workflow evals: `QualityContract.workflow.finalStateChecks`
+  validates external environment state after a task.
+## Minimal Example
+```ts
+import {
+  createExternalRunRecord,
+  createStandardRunRecord,
+  evaluateRunRecord,
+  runBenchmarkSuite,
+} from "@stable-harness/evaluation";
+const record = createStandardRunRecord({
+  run,
+  runtimeMode: "stable_harness_passthrough",
+});
+const pureDeepAgentsRecord = createExternalRunRecord({
+  requestId: "pure-1",
+  runtimeMode: "pure_deepagents",
+  input: "research task",
+  output: "done",
+  trajectory: [
+    { kind: "tool", name: "search", status: "completed", toolId: "search" },
+  ],
+});
+const evaluation = evaluateRunRecord({
+  record,
+  contract: {
+    requiredEvidence: { tools: ["search"] },
+    trajectory: {
+      mode: "ordered",
+      expected: [
+        { kind: "tool", toolId: "search", status: "started" },
+        { kind: "tool", toolId: "search", status: "completed" },
+      ],
+    },
+    workflow: {
+      finalStateChecks: [{ path: "reservation.status", equals: "confirmed" }],
+    },
+    controlStates: { preserveAsBlockers: true },
+  },
+  finalState,
+});
+```
+The same contract can be used in `runBenchmarkSuite` to compare pure DeepAgents
+against stable-harness runtime modes under the same model, tools, tasks, trials,
+and evaluator.

package/docs/guides/index.md CHANGED Viewed

@@ -16,6 +16,8 @@ embed it, operate it, or explain why it exists.
   portable Docker runtime with a generic persistent data mount.
 - [Quality gates](quality-gates.md): enable plan review, execution evidence
   review, and configured recovery loops without replacing upstream planning.
+- [Evaluation foundation](evaluation-foundation.md): normalize run records,
+  declare quality contracts, and compare DeepAgents runtime modes.
 - [Operator runbook](operator-runbook.md): validate a workspace, inspect
   events, run smoke tests, and keep the runtime operable.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "stable-harness",
-  "version": "0.0.51",
+  "version": "0.0.53",
   "type": "module",
   "description": "Stable application runtime and operator control plane for agent workspaces.",
   "license": "Apache-2.0",

package/packages/evaluation/dist/src/benchmark.d.ts ADDED Viewed

@@ -0,0 +1,51 @@
+import { type QualityContract, type StandardEvaluationReport } from "./evaluators.js";
+import type { BenchmarkRuntimeMode, StandardRunRecord } from "./run-record.js";
+export type BenchmarkTask = { // One benchmark case; runBenchmarkSuite hands it to every runtime.
+    id: string; // Copied to BenchmarkTaskResult.taskId.
+    input: string;
+    quality?: QualityContract; // Passed as `contract` to evaluateRunRecord; omitted means no checks.
+    referenceOutputs?: Record<string, unknown>; // NOTE(review): not read by runBenchmarkSuite in benchmark.js - presumably for custom runtimes; confirm.
+    finalState?: unknown; // Fallback final state, used only when the runtime output has no finalState.
+    metadata?: Record<string, unknown>;
+};
+export type BenchmarkRuntime = { // A runtime under comparison.
+    mode: BenchmarkRuntimeMode; // Copied to each result and used as the grouping key for the summary.
+    run(task: BenchmarkTask, trial: number): Promise<BenchmarkRunOutput> | BenchmarkRunOutput; // `trial` is the zero-based trial index; the result is awaited.
+};
+export type BenchmarkRunOutput = {
+    record: StandardRunRecord; // The record that gets evaluated and stored on the result.
+    finalState?: unknown; // Takes precedence over BenchmarkTask.finalState when not null/undefined.
+};
+export type BenchmarkSuiteInput = {
+    suiteId: string;
+    tasks: BenchmarkTask[];
+    runtimes: BenchmarkRuntime[];
+    trials?: number; // Defaults to 1 when null/undefined.
+};
+export type BenchmarkSuiteReport = {
+    schemaVersion: 1;
+    kind: "stable-harness.benchmark-report";
+    suiteId: string;
+    createdAt: string; // ISO-8601 timestamp from Date#toISOString, taken after all runs finished.
+    trials: number;
+    results: BenchmarkTaskResult[]; // Ordered by trial, then task, then runtime.
+    summary: BenchmarkSummary[]; // One entry per runtime mode, in first-seen order.
+};
+export type BenchmarkTaskResult = {
+    taskId: string;
+    trial: number; // Zero-based.
+    runtimeMode: BenchmarkRuntimeMode;
+    record: StandardRunRecord;
+    evaluation: StandardEvaluationReport;
+};
+export type BenchmarkSummary = { // Aggregates all results that share a runtime mode.
+    runtimeMode: BenchmarkRuntimeMode;
+    total: number;
+    passed: number; // Count of results whose evaluation verdict is "pass".
+    failed: number; // Count with verdict "fail".
+    blocked: number; // Count with verdict "blocked".
+    needsReview: number; // Count with verdict "needs_review".
+    passRate: number; // passed / total, or 0 when total is 0.
+    averageScores: Record<string, number>; // Mean score per category over the results that report that category.
+};
+export declare function runBenchmarkSuite(input: BenchmarkSuiteInput): Promise<BenchmarkSuiteReport>; // Runs are awaited one at a time (trial -> task -> runtime).

package/packages/evaluation/dist/src/benchmark.js ADDED Viewed

@@ -0,0 +1 @@

import { evaluateRunRecord } from "./evaluators.js";

/**
 * Runs every task against every runtime, `trials` times, and reports the
 * evaluated results together with a per-runtime-mode summary.
 *
 * Runs are awaited strictly one after another in trial -> task -> runtime
 * order, which is also the order of `results`.
 *
 * @param {object} input - Suite definition (`suiteId`, `tasks`, `runtimes`, optional `trials`).
 * @returns {Promise<object>} The benchmark report; `createdAt` is taken after all runs finished.
 */
export async function runBenchmarkSuite(input) {
  const trialCount = input.trials ?? 1;
  const results = [];
  for (let trial = 0; trial < trialCount; trial += 1) {
    for (const task of input.tasks) {
      for (const runtime of input.runtimes) {
        results.push(await runTask(task, runtime, trial));
      }
    }
  }
  return {
    schemaVersion: 1,
    kind: "stable-harness.benchmark-report",
    suiteId: input.suiteId,
    createdAt: new Date().toISOString(),
    trials: trialCount,
    results,
    summary: summarizeBenchmark(results),
  };
}

/**
 * Executes one task on one runtime and evaluates the produced record against
 * the task's quality contract.
 */
async function runTask(task, runtime, trial) {
  // Kept from the original build output: yields once before calling the runtime.
  await undefined;
  const output = await runtime.run(task, trial);
  const evaluation = evaluateRunRecord({
    record: output.record,
    contract: task.quality,
    // A final state reported by the runtime wins over the one declared on the task.
    finalState: output.finalState ?? task.finalState,
  });
  return {
    taskId: task.id,
    trial,
    runtimeMode: runtime.mode,
    record: output.record,
    evaluation,
  };
}

/** Groups results by runtime mode (first-seen order) and summarizes each group. */
function summarizeBenchmark(results) {
  const groups = new Map();
  for (const result of results) {
    const group = groups.get(result.runtimeMode);
    if (group === undefined) {
      groups.set(result.runtimeMode, [result]);
    } else {
      group.push(result);
    }
  }
  return Array.from(groups, ([runtimeMode, group]) => summarizeRuntime(runtimeMode, group));
}

/** Counts verdicts and averages scores for the results of a single runtime mode. */
function summarizeRuntime(runtimeMode, group) {
  const countVerdict = (verdict) =>
    group.reduce((count, result) => (result.evaluation.verdict === verdict ? count + 1 : count), 0);
  const total = group.length;
  const passed = countVerdict("pass");
  return {
    runtimeMode,
    total,
    passed,
    failed: countVerdict("fail"),
    blocked: countVerdict("blocked"),
    needsReview: countVerdict("needs_review"),
    passRate: total > 0 ? passed / total : 0,
    averageScores: averageScores(group),
  };
}

/**
 * Averages each score category over the results that report it; categories
 * keep the order in which they were first seen.
 */
function averageScores(results) {
  const totals = new Map();
  for (const result of results) {
    for (const [category, score] of Object.entries(result.evaluation.scores)) {
      const running = totals.get(category) ?? { sum: 0, count: 0 };
      running.sum += score;
      running.count += 1;
      totals.set(category, running);
    }
  }
  return Object.fromEntries(
    Array.from(totals, ([category, { sum, count }]) => [category, sum / count]),
  );
}

package/packages/evaluation/dist/src/evaluators.d.ts ADDED Viewed

@@ -0,0 +1,68 @@
+import type { BenchmarkRuntimeMode, StandardRunRecord, StandardTrajectoryStep } from "./run-record.js";
+export type QualityContract = { // Declarative success criteria; every section is optional and an empty section adds no checks.
+    finalResponse?: {
+        rubric?: string; // NOTE(review): not read by evaluateRunRecord in evaluators.js - presumably for an external judge; confirm.
+        requiredSubstrings?: string[]; // Each must occur in request.output (a missing output is treated as "").
+    };
+    requiredEvidence?: {
+        tools?: string[]; // Tool ids that must have at least one trajectory step with status "completed".
+    };
+    tools?: {
+        expected?: ExpectedToolCall[]; // One check per entry, matched against the first completed step with the same toolId.
+        validateArguments?: boolean; // When truthy, each expected `arguments` entry must equal the same key of that step's arguments.
+    };
+    trajectory?: {
+        expected?: ExpectedTrajectoryStep[];
+        mode?: "any_order" | "ordered" | "judge"; // "judge" yields a needs_review check; "ordered" requires the steps in order (others may interleave); otherwise each expected step must match some step.
+    };
+    workflow?: {
+        finalStateChecks?: WorkflowFinalStateCheck[]; // One check per entry, evaluated against the `finalState` argument.
+    };
+    controlStates?: {
+        preserveAsBlockers?: boolean; // When truthy and blocked steps exist, at least one blocked step's name must appear in request.output.
+    };
+    approvals?: {
+        requiredFor?: string[]; // Each entry must be a substring of the name of some step whose kind is "approval".
+    };
+};
+export type ExpectedToolCall = {
+    toolId: string;
+    arguments?: Record<string, unknown>; // Subset match: only the listed keys are compared.
+};
+export type ExpectedTrajectoryStep = { // Only fields with a truthy value are compared against a recorded step.
+    kind?: StandardTrajectoryStep["kind"];
+    name?: string;
+    toolId?: string;
+    subagentType?: string;
+    status?: StandardTrajectoryStep["status"];
+};
+export type WorkflowFinalStateCheck = {
+    path: string; // Dot-separated; empty segments are ignored; traversal only descends into non-array objects.
+    equals?: unknown; // Checked whenever the key is present; values are compared through their JSON serialization.
+    includes?: unknown; // Array value: some element equals it; otherwise substring test on String(value ?? "").
+    exists?: boolean; // Expected definedness of the value at `path` (value !== undefined).
+};
+export type EvaluationVerdict = "pass" | "fail" | "blocked" | "needs_review";
+export type StandardEvaluationReport = {
+    schemaVersion: 1;
+    kind: "stable-harness.evaluation-report";
+    requestId: string; // Copied from record.request.requestId.
+    runtimeMode: BenchmarkRuntimeMode; // Copied from record.runtimeMode.
+    verdict: EvaluationVerdict;
+    scores: Record<string, number>; // Mean check score keyed by check category.
+    checks: EvaluationCheck[];
+};
+export type EvaluationCheck = {
+    id: string;
+    category: "final_response" | "tool_call" | "trajectory" | "workflow" | "control_state" | "approval";
+    verdict: EvaluationVerdict;
+    message: string;
+    score: number; // 1 for a passing check, 0 for a failing or needs_review check.
+    expected?: unknown;
+    observed?: unknown;
+};
+export declare function evaluateRunRecord(input: {
+    record: StandardRunRecord;
+    contract?: QualityContract; // Defaults to an empty contract, which produces no checks.
+    finalState?: unknown; // Only consulted by workflow.finalStateChecks.
+}): StandardEvaluationReport;

package/packages/evaluation/dist/src/evaluators.js ADDED Viewed

@@ -0,0 +1 @@

/**
 * Evaluates a normalized run record against an optional quality contract.
 *
 * Each contract section contributes zero or more checks; the report carries
 * the individual checks, the mean score per check category and one aggregate
 * verdict.
 *
 * @param {object} input
 * @param {object} input.record - Run record (`request`, `trajectory`, `runtimeMode`).
 * @param {object} [input.contract] - Quality contract; an absent contract yields no checks and a "pass" verdict.
 * @param {unknown} [input.finalState] - External state inspected by `workflow.finalStateChecks`.
 * @returns {object} Evaluation report (`verdict`, `scores`, `checks`, ...).
 */
export function evaluateRunRecord(input) {
  const contract = input.contract ?? {};
  const { record } = input;
  const checks = [
    ...evaluateFinalResponse(record, contract),
    ...evaluateRequiredEvidence(record, contract),
    ...evaluateToolCalls(record, contract),
    ...evaluateTrajectory(record, contract),
    ...evaluateWorkflow(input.finalState, contract),
    ...evaluateControlStates(record, contract),
    ...evaluateApprovals(record, contract),
  ];
  return {
    schemaVersion: 1,
    kind: "stable-harness.evaluation-report",
    requestId: record.request.requestId,
    runtimeMode: record.runtimeMode,
    verdict: summarizeVerdict(checks),
    scores: summarizeScores(checks),
    checks,
  };
}

/** Checks that every required substring occurs in the final output. */
function evaluateFinalResponse(record, contract) {
  const required = contract.finalResponse?.requiredSubstrings ?? [];
  if (required.length === 0) {
    return [];
  }
  const output = record.request.output ?? "";
  const missing = required.filter((text) => !output.includes(text));
  return [
    check(
      "final_response.required_substrings",
      "final_response",
      missing.length === 0,
      "final response contains required substrings",
      required,
      missing,
    ),
  ];
}

/** Checks that every required tool has at least one completed trajectory step. */
function evaluateRequiredEvidence(record, contract) {
  const required = contract.requiredEvidence?.tools ?? [];
  if (required.length === 0) {
    return [];
  }
  const completed = new Set(completedTools(record.trajectory));
  const missing = required.filter((toolId) => !completed.has(toolId));
  return [
    check(
      "evidence.required_tools",
      "tool_call",
      missing.length === 0,
      "required evidence tools completed",
      required,
      missing,
    ),
  ];
}

/** Tool ids of all trajectory steps that completed and carry a tool id. */
function completedTools(trajectory) {
  return trajectory
    .filter((step) => step.status === "completed" && step.toolId)
    .map((step) => step.toolId);
}

/**
 * Emits one check per expected tool call. The first completed step with the
 * expected tool id is the observed call; arguments are only compared when the
 * contract enables `validateArguments` and the expectation lists arguments.
 */
function evaluateToolCalls(record, contract) {
  const expectedCalls = contract.tools?.expected ?? [];
  return expectedCalls.map((expected, position) => {
    const observed = record.trajectory.find(
      (step) => step.toolId === expected.toolId && step.status === "completed",
    );
    const argumentsOk =
      !contract.tools?.validateArguments ||
      !expected.arguments ||
      subsetMatches(observed?.arguments, expected.arguments);
    return check(
      `tool.expected.${position}`,
      "tool_call",
      Boolean(observed) && argumentsOk,
      "expected tool call completed with valid arguments",
      expected,
      observed,
    );
  });
}

/** True when `actual` is a plain object that agrees with every entry of `expected`. */
function subsetMatches(actual, expected) {
  if (!isRecord(actual)) {
    return false;
  }
  return Object.entries(expected).every(([key, value]) => deepEqual(actual[key], value));
}

/**
 * Compares the recorded trajectory with the expected steps.
 * "judge" defers to an external reviewer, "ordered" requires the expected
 * steps as an in-order subsequence, any other mode accepts any order.
 */
function evaluateTrajectory(record, contract) {
  const expected = contract.trajectory?.expected ?? [];
  if (expected.length === 0) {
    return [];
  }
  const mode = contract.trajectory?.mode;
  if (mode === "judge") {
    return [
      needsReview(
        "trajectory.judge",
        "trajectory",
        "trajectory requires an external judge",
        expected,
        record.trajectory,
      ),
    ];
  }
  const matched =
    mode === "ordered"
      ? orderedMatch(record.trajectory, expected)
      : expected.every((want) => record.trajectory.some((step) => stepMatches(step, want)));
  return [
    check(
      "trajectory.expected",
      "trajectory",
      matched,
      "expected trajectory steps matched",
      expected,
      record.trajectory,
    ),
  ];
}

/** True when `expected` occurs in `trajectory` in order; unrelated steps may interleave. */
function orderedMatch(trajectory, expected) {
  let cursor = 0;
  for (const step of trajectory) {
    if (stepMatches(step, expected[cursor])) {
      cursor += 1;
    }
    if (cursor === expected.length) {
      return true;
    }
  }
  return expected.length === 0;
}

/** Emits one check per final-state assertion. */
function evaluateWorkflow(finalState, contract) {
  const assertions = contract.workflow?.finalStateChecks ?? [];
  return assertions.map((assertion, position) => {
    const observed = readPath(finalState, assertion.path);
    return check(
      `workflow.final_state.${position}`,
      "workflow",
      finalStatePasses(observed, assertion),
      "workflow final state check passed",
      assertion,
      observed,
    );
  });
}

/** Resolves a dot-separated path; only plain objects are traversed, anything else yields undefined. */
function readPath(root, path) {
  return path
    .split(".")
    .filter(Boolean)
    .reduce((node, key) => (isRecord(node) ? node[key] : undefined), root);
}

/** Applies the `exists`, `equals` and `includes` clauses; every clause that is present must hold. */
function finalStatePasses(value, assertion) {
  if (assertion.exists !== undefined && assertion.exists !== (value !== undefined)) {
    return false;
  }
  if ("equals" in assertion && !deepEqual(value, assertion.equals)) {
    return false;
  }
  if ("includes" in assertion && !includesValue(value, assertion.includes)) {
    return false;
  }
  return true;
}

/** Array membership (by deep equality) for arrays, substring test on the stringified value otherwise. */
function includesValue(container, wanted) {
  if (Array.isArray(container)) {
    return container.some((item) => deepEqual(item, wanted));
  }
  return String(container ?? "").includes(String(wanted));
}

/**
 * When enabled, passes if there are no blocked steps or the name of at least
 * one blocked step appears in the final output.
 */
function evaluateControlStates(record, contract) {
  if (!contract.controlStates?.preserveAsBlockers) {
    return [];
  }
  const blocked = record.trajectory.filter((step) => step.status === "blocked");
  const output = record.request.output ?? "";
  const visible = blocked.length === 0 || blocked.some((step) => output.includes(step.name));
  return [
    check(
      "control.blocker_preserved",
      "control_state",
      visible,
      "blocked control states are visible in final output",
      blocked.map((step) => step.name),
      output,
    ),
  ];
}

/** Checks that each required approval label is a substring of some approval step name. */
function evaluateApprovals(record, contract) {
  const required = contract.approvals?.requiredFor ?? [];
  if (required.length === 0) {
    return [];
  }
  const approvalNames = record.trajectory
    .filter((step) => step.kind === "approval")
    .map((step) => step.name);
  const missing = required.filter((label) => !approvalNames.some((name) => name.includes(label)));
  return [
    check(
      "approval.required",
      "approval",
      missing.length === 0,
      "required approval flow observed",
      required,
      approvalNames,
    ),
  ];
}

/** Fields of an expected step that are compared when they hold a truthy value. */
const COMPARED_STEP_FIELDS = ["kind", "name", "toolId", "subagentType", "status"];

/** True when every truthy field of `expected` equals the same field of `step`. */
function stepMatches(step, expected) {
  if (!expected) {
    return false;
  }
  return COMPARED_STEP_FIELDS.every((field) => !expected[field] || step[field] === expected[field]);
}

/** Builds a pass/fail check; a passing check scores 1, a failing one 0. */
function check(id, category, passed, message, expected, observed) {
  return {
    id,
    category,
    verdict: passed ? "pass" : "fail",
    message,
    score: passed ? 1 : 0,
    expected,
    observed,
  };
}

/** Builds a check that has to be settled by an external reviewer (score 0). */
function needsReview(id, category, message, expected, observed) {
  return { id, category, verdict: "needs_review", message, score: 0, expected, observed };
}

/**
 * Aggregate verdict with precedence blocked > fail > needs_review > pass.
 *
 * Written with explicit early returns so that no line break can separate
 * `return` from its value (automatic semicolon insertion would otherwise turn
 * the statement into `return;` and the verdict into `undefined`).
 */
function summarizeVerdict(checks) {
  for (const verdict of ["blocked", "fail", "needs_review"]) {
    if (checks.some((item) => item.verdict === verdict)) {
      return verdict;
    }
  }
  return "pass";
}

/** Mean check score per category, categories in first-seen order. */
function summarizeScores(checks) {
  const byCategory = new Map();
  for (const item of checks) {
    byCategory.set(item.category, [...(byCategory.get(item.category) ?? []), item]);
  }
  return Object.fromEntries(
    [...byCategory].map(([category, items]) => [
      category,
      items.reduce((sum, item) => sum + item.score, 0) / items.length,
    ]),
  );
}

/**
 * JSON-based structural equality that ignores object key order.
 *
 * Values are still compared through their JSON serialization (so `undefined`
 * properties are ignored and `toJSON` is honored), but object keys are sorted
 * first: `{a: 1, b: 2}` and `{b: 2, a: 1}` are equal. Cyclic values are not
 * supported and throw.
 */
function deepEqual(left, right) {
  return canonicalJson(left) === canonicalJson(right);
}

/** Serializes a value to JSON with the keys of every plain object sorted. */
function canonicalJson(value) {
  return JSON.stringify(value, (_key, nested) =>
    isRecord(nested)
      ? Object.fromEntries(
          Object.entries(nested).sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0)),
        )
      : nested,
  );
}

/** True for non-null, non-array objects. */
function isRecord(value) {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

package/packages/evaluation/dist/src/index.d.ts CHANGED Viewed

@@ -1,4 +1,10 @@
 export { createTraceRecorder, createReplayManifest, createEvaluationBundle } from "./trace.js";
 export { computeToolCallMetrics } from "./tool-call-metrics.js";
+export { createStandardRunRecord, createExternalRunRecord, createLangSmithEvaluationTarget, projectTrajectory } from "./run-record.js";
+export { evaluateRunRecord } from "./evaluators.js";
+export { runBenchmarkSuite } from "./benchmark.js";
 export type { EvaluationBundle, ReplayManifest } from "./types.js";
 export type { PerToolCallMetrics, ToolCallMetrics } from "./tool-call-metrics.js";
+export type { BenchmarkRuntimeMode, ExternalRunRecordInput, LangSmithEvaluationTarget, StandardRunRecord, StandardTrajectoryStep, } from "./run-record.js";
+export type { EvaluationCheck, EvaluationVerdict, ExpectedToolCall, ExpectedTrajectoryStep, QualityContract, StandardEvaluationReport, WorkflowFinalStateCheck, } from "./evaluators.js";
+export type { BenchmarkRunOutput, BenchmarkRuntime, BenchmarkSuiteInput, BenchmarkSuiteReport, BenchmarkSummary, BenchmarkTask, BenchmarkTaskResult, } from "./benchmark.js";

package/packages/evaluation/dist/src/index.js CHANGED Viewed

	@@ -1 +1 @@
1	- export{createTraceRecorder,createReplayManifest,createEvaluationBundle}from"./trace.js";export{computeToolCallMetrics}from"./tool-call-metrics.js";
1	+ export{createTraceRecorder,createReplayManifest,createEvaluationBundle}from"./trace.js";export{computeToolCallMetrics}from"./tool-call-metrics.js";export{createStandardRunRecord,createExternalRunRecord,createLangSmithEvaluationTarget,projectTrajectory}from"./run-record.js";export{evaluateRunRecord}from"./evaluators.js";export{runBenchmarkSuite}from"./benchmark.js";

package/packages/evaluation/dist/src/run-record.d.ts ADDED Viewed

@@ -0,0 +1,68 @@
+import { type RuntimeEvent, type RuntimeRunRecord, type RuntimeTraceSpan } from "@stable-harness/core";
+export type BenchmarkRuntimeMode = "pure_deepagents" | "stable_harness_passthrough" | "stable_harness_quality_gates" | "stable_harness_recovery";
+export type StandardRunRecord = { // Normalized record shared by stable-harness runs and externally recorded runs.
+    schemaVersion: 1;
+    kind: "stable-harness.run-record";
+    runtimeMode: BenchmarkRuntimeMode;
+    request: {
+        requestId: string;
+        sessionId: string;
+        agentId: string;
+        input: string;
+        state: RuntimeRunRecord["state"];
+        output?: string;
+        error?: string;
+    };
+    trajectory: StandardTrajectoryStep[];
+    spans: RuntimeTraceSpan[]; // From projectRuntimeTraceSpans(run) in createStandardRunRecord; always [] from createExternalRunRecord.
+    raw: {
+        events: RuntimeEvent[];
+        backendTrace?: unknown;
+    };
+    artifacts: RuntimeRunRecord["artifacts"];
+    metadata?: Record<string, unknown>;
+};
+export type StandardTrajectoryStep = {
+    index: number; // Zero-based position within the trajectory array.
+    kind: "message" | "tool" | "subagent" | "approval" | "quality" | "control" | "artifact" | "other";
+    name: string;
+    status: "started" | "completed" | "failed" | "blocked" | "event";
+    toolId?: string;
+    subagentType?: string;
+    arguments?: unknown;
+    output?: unknown;
+    sourceEventType: RuntimeEvent["type"]; // Type of the runtime event the step was projected from.
+    sourceEventId?: string;
+};
+export type LangSmithEvaluationTarget = {
+    inputs: Record<string, unknown>; // createLangSmithEvaluationTarget sets { input }.
+    outputs: Record<string, unknown>; // createLangSmithEvaluationTarget sets { output, trajectory }; each step is reduced to kind, name, status, toolId, subagentType.
+    referenceOutputs?: Record<string, unknown>;
+    metadata: Record<string, unknown>;
+};
+export type ExternalRunRecordInput = { // Input for recording a run that did not go through the stable-harness runtime.
+    requestId: string;
+    sessionId?: string; // Defaults to requestId.
+    agentId?: string; // Defaults to "deepagents".
+    input: string;
+    output?: string;
+    error?: string;
+    state?: RuntimeRunRecord["state"]; // Defaults to "failed" when `error` is truthy, otherwise "completed".
+    runtimeMode: BenchmarkRuntimeMode;
+    trajectory?: Array<Omit<StandardTrajectoryStep, "index" | "sourceEventType"> & { // `index` is assigned from the array position.
+        sourceEventType?: RuntimeEvent["type"]; // Defaults to "runtime.adapter.event".
+    }>;
+    events?: RuntimeEvent[]; // Defaults to [].
+    backendTrace?: unknown;
+    artifacts?: RuntimeRunRecord["artifacts"]; // Defaults to [].
+    metadata?: Record<string, unknown>;
+};
+export declare function createStandardRunRecord(input: {
+    run: RuntimeRunRecord;
+    runtimeMode?: BenchmarkRuntimeMode; // Defaults to "stable_harness_passthrough".
+    backendTrace?: unknown;
+    metadata?: Record<string, unknown>; // Falls back to run.metadata when null/undefined.
+}): StandardRunRecord;
+export declare function createExternalRunRecord(input: ExternalRunRecordInput): StandardRunRecord;
+export declare function createLangSmithEvaluationTarget(record: StandardRunRecord, referenceOutputs?: Record<string, unknown>): LangSmithEvaluationTarget;
+export declare function projectTrajectory(events: RuntimeEvent[]): StandardTrajectoryStep[]; // Events of unrecognized types are dropped; `index` is assigned after dropping.

package/packages/evaluation/dist/src/run-record.js ADDED Viewed

@@ -0,0 +1 @@

import { projectRuntimeTraceSpans } from "@stable-harness/core";

const RUN_RECORD_KIND = "stable-harness.run-record";
const ADAPTER_EVENT_TYPE = "runtime.adapter.event";

/** Maps terminal request event types to a step status; other request events count as "started". */
const REQUEST_STATUS_BY_TYPE = new Map([
  ["runtime.request.completed", "completed"],
  ["runtime.request.failed", "failed"],
  ["runtime.request.cancelled", "blocked"],
]);

/**
 * Normalizes a stable-harness runtime run into a standard run record.
 *
 * @param {object} input
 * @param {object} input.run - Runtime run record (request fields, events, artifacts).
 * @param {string} [input.runtimeMode] - Defaults to "stable_harness_passthrough".
 * @param {unknown} [input.backendTrace] - Stored untouched under `raw.backendTrace`.
 * @param {object} [input.metadata] - Falls back to `run.metadata`.
 * @returns {object} The standard run record.
 */
export function createStandardRunRecord(input) {
  const { run } = input;
  const spans = projectRuntimeTraceSpans(run);
  const request = {
    requestId: run.requestId,
    sessionId: run.sessionId,
    agentId: run.agentId,
    input: run.input,
    state: run.state,
    output: run.output,
    error: run.error,
  };
  return {
    schemaVersion: 1,
    kind: RUN_RECORD_KIND,
    runtimeMode: input.runtimeMode ?? "stable_harness_passthrough",
    request,
    trajectory: projectTrajectory(run.events),
    spans,
    raw: { events: run.events, backendTrace: input.backendTrace },
    artifacts: run.artifacts,
    metadata: input.metadata ?? run.metadata,
  };
}

/**
 * Builds a standard run record for a run recorded outside the stable-harness
 * runtime. Missing session id, agent id, state, trajectory, events and
 * artifacts receive defaults; `spans` is always empty.
 *
 * @param {object} input - External run description.
 * @returns {object} The standard run record.
 */
export function createExternalRunRecord(input) {
  const fallbackState = input.error ? "failed" : "completed";
  const request = {
    requestId: input.requestId,
    sessionId: input.sessionId ?? input.requestId,
    agentId: input.agentId ?? "deepagents",
    input: input.input,
    state: input.state ?? fallbackState,
    output: input.output,
    error: input.error,
  };
  return {
    schemaVersion: 1,
    kind: RUN_RECORD_KIND,
    runtimeMode: input.runtimeMode,
    request,
    trajectory: normalizeExternalTrajectory(input.trajectory ?? []),
    spans: [],
    raw: { events: input.events ?? [], backendTrace: input.backendTrace },
    artifacts: input.artifacts ?? [],
    metadata: input.metadata,
  };
}

/**
 * Projects a run record into an inputs / outputs / metadata shape, reducing
 * every trajectory step to its identifying fields.
 *
 * @param {object} record - Standard run record.
 * @param {object} [referenceOutputs] - Passed through unchanged.
 * @returns {object} The evaluation target.
 */
export function createLangSmithEvaluationTarget(record, referenceOutputs) {
  const { request } = record;
  const trajectory = record.trajectory.map((step) => ({
    kind: step.kind,
    name: step.name,
    status: step.status,
    toolId: step.toolId,
    subagentType: step.subagentType,
  }));
  return {
    inputs: { input: request.input },
    outputs: { output: request.output ?? "", trajectory },
    referenceOutputs,
    metadata: {
      requestId: request.requestId,
      sessionId: request.sessionId,
      agentId: request.agentId,
      runtimeMode: record.runtimeMode,
      state: request.state,
    },
  };
}

/**
 * Converts runtime events into trajectory steps. Events that map to no step
 * are dropped; `index` numbers the remaining steps from zero.
 *
 * @param {object[]} events - Runtime events in emission order.
 * @returns {object[]} Indexed trajectory steps.
 */
export function projectTrajectory(events) {
  const steps = [];
  events.forEach((event) => {
    const step = projectStep(event);
    if (step !== undefined) {
      steps.push({ ...step, index: steps.length });
    }
  });
  return steps;
}

/** Adds `index` and a default `sourceEventType` to externally supplied steps. */
function normalizeExternalTrajectory(steps) {
  return (steps ?? []).map((step, index) => ({
    ...step,
    index,
    sourceEventType: step.sourceEventType ?? ADAPTER_EVENT_TYPE,
  }));
}

/** Maps one runtime event to a trajectory step, or undefined for unrecognized event types. */
function projectStep(event) {
  const { type } = event;
  switch (type) {
    case "runtime.tool.direct.started":
      return toolStep(event, "started");
    case "runtime.tool.direct.completed":
      return toolStep(event, "completed", event.output);
    case "runtime.tool.failure":
      return toolStep(event, "failed", event.error);
    case "runtime.sandbox.decision":
      return namedStep(event, "control", `sandbox:${event.toolId}`, "event");
    case "runtime.tool.circuit.opened":
      return namedStep(event, "control", `circuit:${event.toolId}`, "blocked");
    case "runtime.execution.contract.failed":
      return namedStep(event, "quality", event.reason, "blocked");
    case "runtime.artifact.created":
      return namedStep(event, "artifact", event.artifact.id, "event");
    case ADAPTER_EVENT_TYPE:
      return adapterStep(event);
    default:
      break;
  }
  // Event families recognized by prefix.
  if (type.startsWith("runtime.approval.")) {
    return namedStep(event, "approval", type, "blocked");
  }
  if (type.startsWith("runtime.quality.")) {
    return namedStep(event, "quality", type, "event");
  }
  if (type.startsWith("runtime.request.")) {
    return namedStep(event, "message", type, REQUEST_STATUS_BY_TYPE.get(type) ?? "started");
  }
  return undefined;
}

/** Projects an adapter event: tool start/result, delegation, or a generic "other" step. */
function adapterStep(event) {
  const payload = isPlainRecord(event.event) ? event.event : {};
  const isToolStart =
    payload.eventType === "deepagents.tool_execution.start" || payload.phase === "agent.tool.start";
  if (isToolStart) {
    return adapterToolStep(event, payload, "started");
  }
  const isToolResult =
    payload.eventType === "deepagents.tool_execution.result" || payload.phase === "agent.tool.result";
  if (isToolResult) {
    return adapterToolStep(event, payload, "completed");
  }
  if (payload.traceType === "delegation") {
    return delegationStep(event, payload);
  }
  const label = readString(payload.phase) ?? readString(payload.eventType) ?? ADAPTER_EVENT_TYPE;
  return namedStep(event, "other", label, "event", payload);
}

/** Builds a subagent step from a delegation payload; a ".completed" trace label marks completion. */
function delegationStep(event, payload) {
  const finished = readString(payload.traceLabel)?.endsWith(".completed");
  const status = finished ? "completed" : "started";
  const subagentName = readString(payload.subagentType) ?? "task";
  return {
    ...namedStep(event, "subagent", subagentName, status, payload),
    toolId: "task",
    subagentType: readString(payload.subagentType),
    arguments: payload.taskInput ?? payload.args,
    output: payload.output,
  };
}

/** Builds a tool step from an adapter payload; the tool id "task" is reported as a subagent step. */
function adapterToolStep(event, payload, status) {
  const toolId = readString(payload.toolId) ?? readString(payload.name) ?? "unknown";
  const kind = toolId === "task" ? "subagent" : "tool";
  return {
    ...namedStep(event, kind, toolId, status, payload),
    toolId,
    subagentType: readString(payload.subagentType),
    arguments: payload.args,
    output: payload.output,
  };
}

/** Builds a step for a direct tool event. */
function toolStep(event, status, output) {
  return { ...namedStep(event, "tool", event.toolId, status), toolId: event.toolId, output };
}

/** Common step shape; `index` is added later by the caller. */
function namedStep(event, kind, name, status, stepArguments) {
  return {
    kind,
    name,
    status,
    arguments: stepArguments,
    sourceEventType: event.type,
    sourceEventId: event.eventId,
  };
}

/** True for non-null, non-array objects. */
function isPlainRecord(value) {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/** Returns the value when it is a string with non-whitespace content, otherwise undefined. */
function readString(value) {
  if (typeof value !== "string") {
    return undefined;
  }
  return value.trim() ? value : undefined;
}