stable-harness 0.0.51 → 0.0.53

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -185,10 +185,10 @@ Fast matrix results:
185
185
  | Ollama | `qwen3.5:9b` | native auto tools | 15/15 | none |
186
186
  | Ollama | `qwen3.5:0.8b` | native auto tools | 14/15 | `freshness` day instead of month |
187
187
  | Ollama | `granite4.1:3b` | native auto tools | 14/15 | Chinese news query became unnatural text |
188
- | Ollama | `qwen3:latest` | native auto tools | 14/15 | Workday ticker became `WORK` |
188
+ | Ollama | `qwen3:latest` | native auto tools | 14/15 | ExampleCo ticker became `EXCO` |
189
189
  | Ollama | `qwen2.5:7b-instruct` | native auto tools | 14/15 | path with space was collapsed |
190
- | Ollama | `gemma4:e2b` | native auto tools | 14/15 | missed Chinese Workday stock tool call |
191
- | Ollama | `gemma4:e4b` | native auto tools | 14/15 | missed Chinese Workday stock tool call |
190
+ | Ollama | `gemma4:e2b` | native auto tools | 14/15 | missed Chinese ExampleCo stock tool call |
191
+ | Ollama | `gemma4:e4b` | native auto tools | 14/15 | missed Chinese ExampleCo stock tool call |
192
192
  | Ollama | `lfm2.5-thinking:latest` | native auto tools | 13/15 | namespace typo and HK market error |
193
193
  | Ollama | `qwen3:0.6b` | native auto tools | 12/15 | one timeout, weaker exact title/query handling |
194
194
  | Ollama | `gpt-oss:latest` | native auto tools | 12/15 | path and freshness enum errors |
@@ -0,0 +1,72 @@
1
+ # Evaluation Foundation
2
+
3
+ Stable Harness evaluates DeepAgents workloads by recording facts first, then
4
+ applying benchmark-neutral quality contracts. The foundation has three objects:
5
+
6
+ - `StandardRunRecord`: a normalized record for stable-harness and pure
7
+ DeepAgents runs.
8
+ - `QualityContract`: workspace-declared success criteria for final response,
9
+ tool calls, trajectory, workflow final state, control states, and approvals.
10
+ - `BenchmarkSuiteReport`: a comparable report across runtime modes such as
11
+ `pure_deepagents`, `stable_harness_passthrough`,
12
+ `stable_harness_quality_gates`, and `stable_harness_recovery`.
13
+
14
+ This design keeps DeepAgents execution semantics upstream-owned. Stable Harness
15
+ only records, validates, replays, compares, and governs the run.
16
+
17
+ ## Supported Evaluation Shapes
18
+
19
+ - LangSmith-style evals: final response, single-step/tool, and trajectory data
20
+ can be projected from `createLangSmithEvaluationTarget`.
21
+ - BFCL-style tool evals: `QualityContract.tools.expected` validates tool
22
+ selection and argument subsets.
23
+ - Tau-bench-style workflow evals: `QualityContract.workflow.finalStateChecks`
24
+ validates external environment state after a task.
25
+
26
+ ## Minimal Example
27
+
28
+ ```ts
29
+ import {
30
+ createExternalRunRecord, createStandardRunRecord,
31
+ evaluateRunRecord,
32
+ runBenchmarkSuite,
33
+ } from "@stable-harness/evaluation";
34
+
35
+ const record = createStandardRunRecord({
36
+ run,
37
+ runtimeMode: "stable_harness_passthrough",
38
+ });
39
+
40
+ const pureDeepAgentsRecord = createExternalRunRecord({
41
+ requestId: "pure-1",
42
+ runtimeMode: "pure_deepagents",
43
+ input: "research task",
44
+ output: "done",
45
+ trajectory: [
46
+ { kind: "tool", name: "search", status: "completed", toolId: "search" },
47
+ ],
48
+ });
49
+
50
+ const evaluation = evaluateRunRecord({
51
+ record,
52
+ contract: {
53
+ requiredEvidence: { tools: ["search"] },
54
+ trajectory: {
55
+ mode: "ordered",
56
+ expected: [
57
+ { kind: "tool", toolId: "search", status: "started" },
58
+ { kind: "tool", toolId: "search", status: "completed" },
59
+ ],
60
+ },
61
+ workflow: {
62
+ finalStateChecks: [{ path: "reservation.status", equals: "confirmed" }],
63
+ },
64
+ controlStates: { preserveAsBlockers: true },
65
+ },
66
+ finalState,
67
+ });
68
+ ```
69
+
70
+ The same contract can be used in `runBenchmarkSuite` to compare pure DeepAgents
71
+ against stable-harness runtime modes under the same model, tools, tasks, trials,
72
+ and evaluator.
@@ -16,6 +16,8 @@ embed it, operate it, or explain why it exists.
16
16
  portable Docker runtime with a generic persistent data mount.
17
17
  - [Quality gates](quality-gates.md): enable plan review, execution evidence
18
18
  review, and configured recovery loops without replacing upstream planning.
19
+ - [Evaluation foundation](evaluation-foundation.md): normalize run records,
20
+ declare quality contracts, and compare DeepAgents runtime modes.
19
21
  - [Operator runbook](operator-runbook.md): validate a workspace, inspect
20
22
  events, run smoke tests, and keep the runtime operable.
21
23
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "stable-harness",
3
- "version": "0.0.51",
3
+ "version": "0.0.53",
4
4
  "type": "module",
5
5
  "description": "Stable application runtime and operator control plane for agent workspaces.",
6
6
  "license": "Apache-2.0",
@@ -0,0 +1,51 @@
1
+ import { type QualityContract, type StandardEvaluationReport } from "./evaluators.js";
2
+ import type { BenchmarkRuntimeMode, StandardRunRecord } from "./run-record.js";
3
+ export type BenchmarkTask = {
4
+ id: string;
5
+ input: string;
6
+ quality?: QualityContract;
7
+ referenceOutputs?: Record<string, unknown>;
8
+ finalState?: unknown;
9
+ metadata?: Record<string, unknown>;
10
+ };
11
+ export type BenchmarkRuntime = {
12
+ mode: BenchmarkRuntimeMode;
13
+ run(task: BenchmarkTask, trial: number): Promise<BenchmarkRunOutput> | BenchmarkRunOutput;
14
+ };
15
+ export type BenchmarkRunOutput = {
16
+ record: StandardRunRecord;
17
+ finalState?: unknown;
18
+ };
19
+ export type BenchmarkSuiteInput = {
20
+ suiteId: string;
21
+ tasks: BenchmarkTask[];
22
+ runtimes: BenchmarkRuntime[];
23
+ trials?: number;
24
+ };
25
+ export type BenchmarkSuiteReport = {
26
+ schemaVersion: 1;
27
+ kind: "stable-harness.benchmark-report";
28
+ suiteId: string;
29
+ createdAt: string;
30
+ trials: number;
31
+ results: BenchmarkTaskResult[];
32
+ summary: BenchmarkSummary[];
33
+ };
34
+ export type BenchmarkTaskResult = {
35
+ taskId: string;
36
+ trial: number;
37
+ runtimeMode: BenchmarkRuntimeMode;
38
+ record: StandardRunRecord;
39
+ evaluation: StandardEvaluationReport;
40
+ };
41
+ export type BenchmarkSummary = {
42
+ runtimeMode: BenchmarkRuntimeMode;
43
+ total: number;
44
+ passed: number;
45
+ failed: number;
46
+ blocked: number;
47
+ needsReview: number;
48
+ passRate: number;
49
+ averageScores: Record<string, number>;
50
+ };
51
+ export declare function runBenchmarkSuite(input: BenchmarkSuiteInput): Promise<BenchmarkSuiteReport>;
@@ -0,0 +1 @@
1
import { evaluateRunRecord } from "./evaluators.js";

/**
 * Run every task against every runtime for the requested number of trials and
 * return a benchmark report.
 *
 * Runs are executed strictly one after another (trial, then task, then
 * runtime), so `results` is ordered the same way.
 *
 * @param input Suite definition: `suiteId`, `tasks`, `runtimes`, and an
 *   optional `trials` count that defaults to 1.
 * @returns A report carrying every individual result plus one summary entry
 *   per runtime mode.
 */
export async function runBenchmarkSuite(input) {
  const trials = input.trials ?? 1;
  const results = [];

  for (let trial = 0; trial < trials; trial += 1) {
    for (const task of input.tasks) {
      for (const runtime of input.runtimes) {
        results.push(await runTask(task, runtime, trial));
      }
    }
  }

  return {
    schemaVersion: 1,
    kind: "stable-harness.benchmark-report",
    suiteId: input.suiteId,
    createdAt: new Date().toISOString(),
    trials,
    results,
    summary: summarizeBenchmark(results),
  };
}

/**
 * Execute one task on one runtime and evaluate the produced record against the
 * task's quality contract.
 *
 * The final state reported by the runtime wins; the task-level `finalState` is
 * only used when the runtime did not report one.
 */
async function runTask(task, runtime, trial) {
  // Yield one microtask tick before invoking the runtime (kept from the
  // original implementation; the reason is not visible in this file).
  await undefined;

  const output = await runtime.run(task, trial);
  const evaluation = evaluateRunRecord({
    record: output.record,
    contract: task.quality,
    finalState: output.finalState ?? task.finalState,
  });

  return {
    taskId: task.id,
    trial,
    runtimeMode: runtime.mode,
    record: output.record,
    evaluation,
  };
}

/**
 * Group results by runtime mode (in first-seen order) and summarize each group.
 */
function summarizeBenchmark(results) {
  const byMode = new Map();
  for (const result of results) {
    let bucket = byMode.get(result.runtimeMode);
    if (bucket === undefined) {
      bucket = [];
      byMode.set(result.runtimeMode, bucket);
    }
    bucket.push(result);
  }
  return Array.from(byMode, ([mode, group]) => summarizeRuntime(mode, group));
}

/**
 * Count verdicts for one runtime mode and compute its pass rate.
 * A verdict outside the four known values only contributes to `total`.
 */
function summarizeRuntime(runtimeMode, results) {
  const total = results.length;
  let passed = 0;
  let failed = 0;
  let blocked = 0;
  let needsReview = 0;

  for (const result of results) {
    switch (result.evaluation.verdict) {
      case "pass":
        passed += 1;
        break;
      case "fail":
        failed += 1;
        break;
      case "blocked":
        blocked += 1;
        break;
      case "needs_review":
        needsReview += 1;
        break;
      default:
        break;
    }
  }

  return {
    runtimeMode,
    total,
    passed,
    failed,
    blocked,
    needsReview,
    passRate: total > 0 ? passed / total : 0,
    averageScores: averageScores(results),
  };
}

/**
 * Average each named score across the given results. A score name is averaged
 * only over the results that actually report it.
 */
function averageScores(results) {
  const totals = new Map();
  for (const result of results) {
    for (const [name, value] of Object.entries(result.evaluation.scores)) {
      let entry = totals.get(name);
      if (entry === undefined) {
        entry = { sum: 0, count: 0 };
        totals.set(name, entry);
      }
      entry.sum = entry.sum + value;
      entry.count += 1;
    }
  }
  return Object.fromEntries(
    Array.from(totals, ([name, entry]) => [name, entry.sum / entry.count]),
  );
}
@@ -0,0 +1,68 @@
1
+ import type { BenchmarkRuntimeMode, StandardRunRecord, StandardTrajectoryStep } from "./run-record.js";
2
+ export type QualityContract = {
3
+ finalResponse?: {
4
+ rubric?: string;
5
+ requiredSubstrings?: string[];
6
+ };
7
+ requiredEvidence?: {
8
+ tools?: string[];
9
+ };
10
+ tools?: {
11
+ expected?: ExpectedToolCall[];
12
+ validateArguments?: boolean;
13
+ };
14
+ trajectory?: {
15
+ expected?: ExpectedTrajectoryStep[];
16
+ mode?: "any_order" | "ordered" | "judge";
17
+ };
18
+ workflow?: {
19
+ finalStateChecks?: WorkflowFinalStateCheck[];
20
+ };
21
+ controlStates?: {
22
+ preserveAsBlockers?: boolean;
23
+ };
24
+ approvals?: {
25
+ requiredFor?: string[];
26
+ };
27
+ };
28
+ export type ExpectedToolCall = {
29
+ toolId: string;
30
+ arguments?: Record<string, unknown>;
31
+ };
32
+ export type ExpectedTrajectoryStep = {
33
+ kind?: StandardTrajectoryStep["kind"];
34
+ name?: string;
35
+ toolId?: string;
36
+ subagentType?: string;
37
+ status?: StandardTrajectoryStep["status"];
38
+ };
39
+ export type WorkflowFinalStateCheck = {
40
+ path: string;
41
+ equals?: unknown;
42
+ includes?: unknown;
43
+ exists?: boolean;
44
+ };
45
+ export type EvaluationVerdict = "pass" | "fail" | "blocked" | "needs_review";
46
+ export type StandardEvaluationReport = {
47
+ schemaVersion: 1;
48
+ kind: "stable-harness.evaluation-report";
49
+ requestId: string;
50
+ runtimeMode: BenchmarkRuntimeMode;
51
+ verdict: EvaluationVerdict;
52
+ scores: Record<string, number>;
53
+ checks: EvaluationCheck[];
54
+ };
55
+ export type EvaluationCheck = {
56
+ id: string;
57
+ category: "final_response" | "tool_call" | "trajectory" | "workflow" | "control_state" | "approval";
58
+ verdict: EvaluationVerdict;
59
+ message: string;
60
+ score: number;
61
+ expected?: unknown;
62
+ observed?: unknown;
63
+ };
64
+ export declare function evaluateRunRecord(input: {
65
+ record: StandardRunRecord;
66
+ contract?: QualityContract;
67
+ finalState?: unknown;
68
+ }): StandardEvaluationReport;
@@ -0,0 +1 @@
1
/**
 * Evaluate a normalized run record against a quality contract.
 *
 * Each contract section that is present produces zero or more checks; a
 * missing contract produces no checks and therefore a "pass" verdict.
 *
 * @param input `record` is the run record to judge, `contract` the optional
 *   quality contract, and `finalState` the optional external state used by the
 *   workflow checks.
 * @returns An evaluation report with the overall verdict, the per-category
 *   average scores, and the individual checks.
 */
export function evaluateRunRecord(input) {
  const contract = input.contract ?? {};
  const checks = [
    ...evaluateFinalResponse(input.record, contract),
    ...evaluateRequiredEvidence(input.record, contract),
    ...evaluateToolCalls(input.record, contract),
    ...evaluateTrajectory(input.record, contract),
    ...evaluateWorkflow(input.finalState, contract),
    ...evaluateControlStates(input.record, contract),
    ...evaluateApprovals(input.record, contract),
  ];
  return {
    schemaVersion: 1,
    kind: "stable-harness.evaluation-report",
    requestId: input.record.request.requestId,
    runtimeMode: input.record.runtimeMode,
    verdict: summarizeVerdict(checks),
    scores: summarizeScores(checks),
    checks,
  };
}

/** Check that the final output contains every required substring. */
function evaluateFinalResponse(record, contract) {
  const required = contract.finalResponse?.requiredSubstrings ?? [];
  if (required.length === 0) return [];
  const output = record.request.output ?? "";
  const missing = required.filter((text) => !output.includes(text));
  return [
    check(
      "final_response.required_substrings",
      "final_response",
      missing.length === 0,
      "final response contains required substrings",
      required,
      missing,
    ),
  ];
}

/** Check that every required evidence tool has at least one completed step. */
function evaluateRequiredEvidence(record, contract) {
  const required = contract.requiredEvidence?.tools ?? [];
  if (required.length === 0) return [];
  const completed = new Set(
    record.trajectory
      .filter((step) => step.status === "completed" && step.toolId)
      .map((step) => step.toolId),
  );
  const missing = required.filter((toolId) => !completed.has(toolId));
  return [
    check(
      "evidence.required_tools",
      "tool_call",
      missing.length === 0,
      "required evidence tools completed",
      required,
      missing,
    ),
  ];
}

/**
 * Check each expected tool call against the completed steps of that tool.
 *
 * When argument validation applies, ANY completed call of the tool may satisfy
 * the expectation, not only the first one, so a run that calls the same tool
 * several times is not failed because of an unrelated earlier call.
 */
function evaluateToolCalls(record, contract) {
  const expected = contract.tools?.expected ?? [];
  if (expected.length === 0) return [];
  return expected.map((expectedCall, index) => {
    const completed = record.trajectory.filter(
      (step) => step.toolId === expectedCall.toolId && step.status === "completed",
    );
    const mustMatchArguments = Boolean(
      contract.tools?.validateArguments && expectedCall.arguments,
    );
    const matched = mustMatchArguments
      ? completed.find((step) => subsetMatches(step.arguments, expectedCall.arguments))
      : completed[0];
    return check(
      `tool.expected.${index}`,
      "tool_call",
      Boolean(matched),
      "expected tool call completed with valid arguments",
      expectedCall,
      // On failure, report the first completed call (if any) as the observation.
      matched ?? completed[0],
    );
  });
}

/** True when `actual` is a plain object holding every expected key with an equal value. */
function subsetMatches(actual, expected) {
  if (!isRecord(actual)) return false;
  return Object.entries(expected).every(([key, value]) => deepEqual(actual[key], value));
}

/**
 * Check the expected trajectory steps.
 * "judge" mode defers to an external judge (needs_review); "ordered" requires
 * the expected steps to appear in order; any other mode only requires each
 * expected step to appear somewhere.
 */
function evaluateTrajectory(record, contract) {
  const expected = contract.trajectory?.expected ?? [];
  if (expected.length === 0) return [];
  const mode = contract.trajectory?.mode;
  if (mode === "judge") {
    return [
      needsReview(
        "trajectory.judge",
        "trajectory",
        "trajectory requires an external judge",
        expected,
        record.trajectory,
      ),
    ];
  }
  const matched =
    mode === "ordered"
      ? orderedMatch(record.trajectory, expected)
      : expected.every((want) => record.trajectory.some((step) => stepMatches(step, want)));
  return [
    check(
      "trajectory.expected",
      "trajectory",
      matched,
      "expected trajectory steps matched",
      expected,
      record.trajectory,
    ),
  ];
}

/** True when the expected steps occur in `steps` in order (gaps are allowed). */
function orderedMatch(steps, expected) {
  let next = 0;
  for (const step of steps) {
    if (stepMatches(step, expected[next])) next += 1;
    if (next === expected.length) return true;
  }
  return expected.length === 0;
}

/** Run every workflow final-state check against the supplied final state. */
function evaluateWorkflow(finalState, contract) {
  const stateChecks = contract.workflow?.finalStateChecks ?? [];
  if (stateChecks.length === 0) return [];
  return stateChecks.map((stateCheck, index) => {
    const observed = readPath(finalState, stateCheck.path);
    return check(
      `workflow.final_state.${index}`,
      "workflow",
      finalStatePasses(observed, stateCheck),
      "workflow final state check passed",
      stateCheck,
      observed,
    );
  });
}

/** Resolve a dot-separated path through plain objects; arrays are not traversed. */
function readPath(root, path) {
  return path
    .split(".")
    .filter(Boolean)
    .reduce((current, key) => (isRecord(current) ? current[key] : undefined), root);
}

/** Apply the `exists`, `equals`, and `includes` conditions that are present on the check. */
function finalStatePasses(value, stateCheck) {
  if (stateCheck.exists !== undefined && stateCheck.exists !== (value !== undefined)) {
    return false;
  }
  if ("equals" in stateCheck && !deepEqual(value, stateCheck.equals)) return false;
  if ("includes" in stateCheck && !includesValue(value, stateCheck.includes)) return false;
  return true;
}

/** Array membership by deep equality, otherwise substring match on the stringified value. */
function includesValue(value, wanted) {
  if (Array.isArray(value)) return value.some((item) => deepEqual(item, wanted));
  return String(value ?? "").includes(String(wanted));
}

/**
 * When the contract asks for it, check that blocked steps surface in the final
 * output. The check passes when there are no blocked steps or when at least one
 * blocked step's name appears in the output.
 */
function evaluateControlStates(record, contract) {
  if (!contract.controlStates?.preserveAsBlockers) return [];
  const blockedSteps = record.trajectory.filter((step) => step.status === "blocked");
  const output = record.request.output ?? "";
  const visible =
    blockedSteps.length === 0 || blockedSteps.some((step) => output.includes(step.name));
  return [
    check(
      "control.blocker_preserved",
      "control_state",
      visible,
      "blocked control states are visible in final output",
      blockedSteps.map((step) => step.name),
      output,
    ),
  ];
}

/** Check that every required approval label is contained in some approval step name. */
function evaluateApprovals(record, contract) {
  const required = contract.approvals?.requiredFor ?? [];
  if (required.length === 0) return [];
  const observed = record.trajectory
    .filter((step) => step.kind === "approval")
    .map((step) => step.name);
  const missing = required.filter((label) => !observed.some((name) => name.includes(label)));
  return [
    check(
      "approval.required",
      "approval",
      missing.length === 0,
      "required approval flow observed",
      required,
      observed,
    ),
  ];
}

/** True when every field set on `expected` equals the same field on `step`. */
function stepMatches(step, expected) {
  if (!expected) return false;
  if (expected.kind && step.kind !== expected.kind) return false;
  if (expected.name && step.name !== expected.name) return false;
  if (expected.toolId && step.toolId !== expected.toolId) return false;
  if (expected.subagentType && step.subagentType !== expected.subagentType) return false;
  if (expected.status && step.status !== expected.status) return false;
  return true;
}

/** Build a pass/fail check with a score of 1 or 0. */
function check(id, category, passed, message, expected, observed) {
  return {
    id,
    category,
    verdict: passed ? "pass" : "fail",
    message,
    score: passed ? 1 : 0,
    expected,
    observed,
  };
}

/** Build a check that must be resolved by an external reviewer (score 0). */
function needsReview(id, category, message, expected, observed) {
  return { id, category, verdict: "needs_review", message, score: 0, expected, observed };
}

/** Overall verdict by precedence: blocked, then fail, then needs_review, then pass. */
function summarizeVerdict(checks) {
  for (const verdict of ["blocked", "fail", "needs_review"]) {
    if (checks.some((item) => item.verdict === verdict)) return verdict;
  }
  return "pass";
}

/** Average the check scores per category, in first-seen category order. */
function summarizeScores(checks) {
  const totals = new Map();
  for (const item of checks) {
    let entry = totals.get(item.category);
    if (entry === undefined) {
      entry = { sum: 0, count: 0 };
      totals.set(item.category, entry);
    }
    entry.sum = entry.sum + item.score;
    entry.count += 1;
  }
  return Object.fromEntries(
    Array.from(totals, ([category, entry]) => [category, entry.sum / entry.count]),
  );
}

/**
 * Structural equality with JSON semantics, independent of object key order.
 *
 * Both values are serialized with their plain-object keys sorted, so
 * `{a: 1, b: 2}` equals `{b: 2, a: 1}`. Everything else follows
 * `JSON.stringify` (for example, `undefined` properties are ignored).
 */
function deepEqual(left, right) {
  return canonicalJson(left) === canonicalJson(right);
}

/** Serialize a value to JSON with the keys of every plain object sorted. */
function canonicalJson(value) {
  return JSON.stringify(value, (_key, nested) =>
    isRecord(nested)
      ? Object.fromEntries(
          Object.keys(nested)
            .sort()
            .map((key) => [key, nested[key]]),
        )
      : nested,
  );
}

/** True for non-null, non-array objects. */
function isRecord(value) {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}
@@ -1,4 +1,10 @@
1
1
  export { createTraceRecorder, createReplayManifest, createEvaluationBundle } from "./trace.js";
2
2
  export { computeToolCallMetrics } from "./tool-call-metrics.js";
3
+ export { createStandardRunRecord, createExternalRunRecord, createLangSmithEvaluationTarget, projectTrajectory } from "./run-record.js";
4
+ export { evaluateRunRecord } from "./evaluators.js";
5
+ export { runBenchmarkSuite } from "./benchmark.js";
3
6
  export type { EvaluationBundle, ReplayManifest } from "./types.js";
4
7
  export type { PerToolCallMetrics, ToolCallMetrics } from "./tool-call-metrics.js";
8
+ export type { BenchmarkRuntimeMode, ExternalRunRecordInput, LangSmithEvaluationTarget, StandardRunRecord, StandardTrajectoryStep, } from "./run-record.js";
9
+ export type { EvaluationCheck, EvaluationVerdict, ExpectedToolCall, ExpectedTrajectoryStep, QualityContract, StandardEvaluationReport, WorkflowFinalStateCheck, } from "./evaluators.js";
10
+ export type { BenchmarkRunOutput, BenchmarkRuntime, BenchmarkSuiteInput, BenchmarkSuiteReport, BenchmarkSummary, BenchmarkTask, BenchmarkTaskResult, } from "./benchmark.js";
@@ -1 +1 @@
1
- export{createTraceRecorder,createReplayManifest,createEvaluationBundle}from"./trace.js";export{computeToolCallMetrics}from"./tool-call-metrics.js";
1
+ export{createTraceRecorder,createReplayManifest,createEvaluationBundle}from"./trace.js";export{computeToolCallMetrics}from"./tool-call-metrics.js";export{createStandardRunRecord,createExternalRunRecord,createLangSmithEvaluationTarget,projectTrajectory}from"./run-record.js";export{evaluateRunRecord}from"./evaluators.js";export{runBenchmarkSuite}from"./benchmark.js";
@@ -0,0 +1,68 @@
1
+ import { type RuntimeEvent, type RuntimeRunRecord, type RuntimeTraceSpan } from "@stable-harness/core";
2
+ export type BenchmarkRuntimeMode = "pure_deepagents" | "stable_harness_passthrough" | "stable_harness_quality_gates" | "stable_harness_recovery";
3
+ export type StandardRunRecord = {
4
+ schemaVersion: 1;
5
+ kind: "stable-harness.run-record";
6
+ runtimeMode: BenchmarkRuntimeMode;
7
+ request: {
8
+ requestId: string;
9
+ sessionId: string;
10
+ agentId: string;
11
+ input: string;
12
+ state: RuntimeRunRecord["state"];
13
+ output?: string;
14
+ error?: string;
15
+ };
16
+ trajectory: StandardTrajectoryStep[];
17
+ spans: RuntimeTraceSpan[];
18
+ raw: {
19
+ events: RuntimeEvent[];
20
+ backendTrace?: unknown;
21
+ };
22
+ artifacts: RuntimeRunRecord["artifacts"];
23
+ metadata?: Record<string, unknown>;
24
+ };
25
+ export type StandardTrajectoryStep = {
26
+ index: number;
27
+ kind: "message" | "tool" | "subagent" | "approval" | "quality" | "control" | "artifact" | "other";
28
+ name: string;
29
+ status: "started" | "completed" | "failed" | "blocked" | "event";
30
+ toolId?: string;
31
+ subagentType?: string;
32
+ arguments?: unknown;
33
+ output?: unknown;
34
+ sourceEventType: RuntimeEvent["type"];
35
+ sourceEventId?: string;
36
+ };
37
+ export type LangSmithEvaluationTarget = {
38
+ inputs: Record<string, unknown>;
39
+ outputs: Record<string, unknown>;
40
+ referenceOutputs?: Record<string, unknown>;
41
+ metadata: Record<string, unknown>;
42
+ };
43
+ export type ExternalRunRecordInput = {
44
+ requestId: string;
45
+ sessionId?: string;
46
+ agentId?: string;
47
+ input: string;
48
+ output?: string;
49
+ error?: string;
50
+ state?: RuntimeRunRecord["state"];
51
+ runtimeMode: BenchmarkRuntimeMode;
52
+ trajectory?: Array<Omit<StandardTrajectoryStep, "index" | "sourceEventType"> & {
53
+ sourceEventType?: RuntimeEvent["type"];
54
+ }>;
55
+ events?: RuntimeEvent[];
56
+ backendTrace?: unknown;
57
+ artifacts?: RuntimeRunRecord["artifacts"];
58
+ metadata?: Record<string, unknown>;
59
+ };
60
+ export declare function createStandardRunRecord(input: {
61
+ run: RuntimeRunRecord;
62
+ runtimeMode?: BenchmarkRuntimeMode;
63
+ backendTrace?: unknown;
64
+ metadata?: Record<string, unknown>;
65
+ }): StandardRunRecord;
66
+ export declare function createExternalRunRecord(input: ExternalRunRecordInput): StandardRunRecord;
67
+ export declare function createLangSmithEvaluationTarget(record: StandardRunRecord, referenceOutputs?: Record<string, unknown>): LangSmithEvaluationTarget;
68
+ export declare function projectTrajectory(events: RuntimeEvent[]): StandardTrajectoryStep[];
@@ -0,0 +1 @@
1
import { projectRuntimeTraceSpans } from "@stable-harness/core";

/**
 * Build a standard run record from a stable-harness runtime run.
 *
 * @param input `run` is the runtime run record; `runtimeMode` defaults to
 *   "stable_harness_passthrough"; `metadata` falls back to the run's own
 *   metadata; `backendTrace` is stored untouched under `raw`.
 * @returns The normalized record, with a trajectory projected from the run's
 *   events and spans produced by `projectRuntimeTraceSpans`.
 */
export function createStandardRunRecord(input) {
  const { run } = input;
  const spans = projectRuntimeTraceSpans(run);
  return {
    schemaVersion: 1,
    kind: "stable-harness.run-record",
    runtimeMode: input.runtimeMode ?? "stable_harness_passthrough",
    request: {
      requestId: run.requestId,
      sessionId: run.sessionId,
      agentId: run.agentId,
      input: run.input,
      state: run.state,
      output: run.output,
      error: run.error,
    },
    trajectory: projectTrajectory(run.events),
    spans,
    raw: { events: run.events, backendTrace: input.backendTrace },
    artifacts: run.artifacts,
    metadata: input.metadata ?? run.metadata,
  };
}

/**
 * Build a standard run record for a run that was produced outside the
 * stable-harness runtime.
 *
 * Defaults: `sessionId` falls back to `requestId`, `agentId` to "deepagents",
 * `state` to "failed" when an error is given and "completed" otherwise, and
 * the trajectory, events, and artifacts to empty arrays. Spans are always empty.
 */
export function createExternalRunRecord(input) {
  const sessionId = input.sessionId ?? input.requestId;
  const agentId = input.agentId ?? "deepagents";
  const fallbackState = input.error ? "failed" : "completed";
  return {
    schemaVersion: 1,
    kind: "stable-harness.run-record",
    runtimeMode: input.runtimeMode,
    request: {
      requestId: input.requestId,
      sessionId,
      agentId,
      input: input.input,
      state: input.state ?? fallbackState,
      output: input.output,
      error: input.error,
    },
    trajectory: normalizeExternalTrajectory(input.trajectory ?? []),
    spans: [],
    raw: { events: input.events ?? [], backendTrace: input.backendTrace },
    artifacts: input.artifacts ?? [],
    metadata: input.metadata,
  };
}

/**
 * Project a run record into an inputs/outputs/metadata target object.
 * Only the kind, name, status, toolId, and subagentType of each trajectory
 * step are exposed in the outputs.
 */
export function createLangSmithEvaluationTarget(record, referenceOutputs) {
  const { request } = record;
  const trajectory = record.trajectory.map((step) => ({
    kind: step.kind,
    name: step.name,
    status: step.status,
    toolId: step.toolId,
    subagentType: step.subagentType,
  }));
  return {
    inputs: { input: request.input },
    outputs: { output: request.output ?? "", trajectory },
    referenceOutputs,
    metadata: {
      requestId: request.requestId,
      sessionId: request.sessionId,
      agentId: request.agentId,
      runtimeMode: record.runtimeMode,
      state: request.state,
    },
  };
}

/**
 * Turn runtime events into trajectory steps. Events with no projection are
 * dropped, and `index` numbers the remaining steps from zero.
 */
export function projectTrajectory(events) {
  return events
    .flatMap((event) => {
      const step = projectStep(event);
      return step === undefined ? [] : [step];
    })
    .map((step, index) => ({ ...step, index }));
}

/** Number externally supplied steps and default their source event type. */
function normalizeExternalTrajectory(steps) {
  const list = steps ?? [];
  return list.map((step, index) => ({
    ...step,
    index,
    sourceEventType: step.sourceEventType ?? "runtime.adapter.event",
  }));
}

/**
 * Map one runtime event to a trajectory step, or `undefined` when the event
 * type is not recognized. Exact event types are matched first, then the
 * approval, quality, and request prefixes.
 */
function projectStep(event) {
  switch (event.type) {
    case "runtime.tool.direct.started":
      return toolStep(event, "started");
    case "runtime.tool.direct.completed":
      return toolStep(event, "completed", event.output);
    case "runtime.tool.failure":
      return toolStep(event, "failed", event.error);
    case "runtime.sandbox.decision":
      return namedStep(event, "control", `sandbox:${event.toolId}`, "event");
    case "runtime.tool.circuit.opened":
      return namedStep(event, "control", `circuit:${event.toolId}`, "blocked");
    case "runtime.execution.contract.failed":
      return namedStep(event, "quality", event.reason, "blocked");
    case "runtime.artifact.created":
      return namedStep(event, "artifact", event.artifact.id, "event");
    case "runtime.adapter.event":
      return adapterStep(event);
    default:
      break;
  }
  // Every approval event is recorded with status "blocked".
  if (event.type.startsWith("runtime.approval.")) {
    return namedStep(event, "approval", event.type, "blocked");
  }
  if (event.type.startsWith("runtime.quality.")) {
    return namedStep(event, "quality", event.type, "event");
  }
  if (event.type.startsWith("runtime.request.")) {
    return namedStep(event, "message", event.type, requestStatus(event));
  }
  return undefined;
}

/** Project an adapter event by inspecting its nested `event` payload. */
function adapterStep(event) {
  const payload = isRecord(event.event) ? event.event : {};
  if (isToolStart(payload)) return adapterToolStep(event, payload, "started");
  if (isToolResult(payload)) return adapterToolStep(event, payload, "completed");
  if (payload.traceType === "delegation") return delegationStep(event, payload);
  const name =
    readString(payload.phase) ?? readString(payload.eventType) ?? "runtime.adapter.event";
  return namedStep(event, "other", name, "event", payload);
}

/** True when the adapter payload marks the start of a tool execution. */
function isToolStart(payload) {
  return (
    payload.eventType === "deepagents.tool_execution.start" ||
    payload.phase === "agent.tool.start"
  );
}

/** True when the adapter payload carries a tool execution result. */
function isToolResult(payload) {
  return (
    payload.eventType === "deepagents.tool_execution.result" ||
    payload.phase === "agent.tool.result"
  );
}

/**
 * Project a delegation payload as a subagent step on the "task" tool. The step
 * is "completed" when the trace label ends with ".completed", else "started".
 */
function delegationStep(event, payload) {
  const finished = readString(payload.traceLabel)?.endsWith(".completed");
  const status = finished ? "completed" : "started";
  const name = readString(payload.subagentType) ?? "task";
  return {
    ...namedStep(event, "subagent", name, status, payload),
    toolId: "task",
    subagentType: readString(payload.subagentType),
    arguments: payload.taskInput ?? payload.args,
    output: payload.output,
  };
}

/** Map a request lifecycle event type to a step status. */
function requestStatus(event) {
  switch (event.type) {
    case "runtime.request.completed":
      return "completed";
    case "runtime.request.failed":
      return "failed";
    case "runtime.request.cancelled":
      return "blocked";
    default:
      return "started";
  }
}

/**
 * Project an adapter tool payload. The tool id comes from `toolId`, then
 * `name`, then "unknown"; the "task" tool is recorded as a subagent step.
 */
function adapterToolStep(event, payload, status) {
  const toolId = readString(payload.toolId) ?? readString(payload.name) ?? "unknown";
  const kind = toolId === "task" ? "subagent" : "tool";
  return {
    ...namedStep(event, kind, toolId, status, payload),
    toolId,
    subagentType: readString(payload.subagentType),
    arguments: payload.args,
    output: payload.output,
  };
}

/** Project a direct tool event, named after its tool id. */
function toolStep(event, status, output) {
  return {
    ...namedStep(event, "tool", event.toolId, status),
    toolId: event.toolId,
    output,
  };
}

/** Build the fields shared by every step, including the source event's type and id. */
function namedStep(event, kind, name, status, args) {
  return {
    kind,
    name,
    status,
    arguments: args,
    sourceEventType: event.type,
    sourceEventId: event.eventId,
  };
}

/** True for non-null, non-array objects. */
function isRecord(value) {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/** Return the value when it is a string with non-whitespace content, else `undefined`. */
function readString(value) {
  if (typeof value !== "string") return undefined;
  return value.trim() ? value : undefined;
}