stable-harness 0.0.51 → 0.0.53
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/granite-tool-calling-comparison.zh.md +3 -3
- package/docs/guides/evaluation-foundation.md +72 -0
- package/docs/guides/index.md +2 -0
- package/package.json +1 -1
- package/packages/evaluation/dist/src/benchmark.d.ts +51 -0
- package/packages/evaluation/dist/src/benchmark.js +1 -0
- package/packages/evaluation/dist/src/evaluators.d.ts +68 -0
- package/packages/evaluation/dist/src/evaluators.js +1 -0
- package/packages/evaluation/dist/src/index.d.ts +6 -0
- package/packages/evaluation/dist/src/index.js +1 -1
- package/packages/evaluation/dist/src/run-record.d.ts +68 -0
- package/packages/evaluation/dist/src/run-record.js +1 -0
|
@@ -185,10 +185,10 @@ Fast matrix results:
|
|
|
185
185
|
| Ollama | `qwen3.5:9b` | native auto tools | 15/15 | none |
|
|
186
186
|
| Ollama | `qwen3.5:0.8b` | native auto tools | 14/15 | `freshness` day instead of month |
|
|
187
187
|
| Ollama | `granite4.1:3b` | native auto tools | 14/15 | Chinese news query became unnatural text |
|
|
188
|
-
| Ollama | `qwen3:latest` | native auto tools | 14/15 |
|
|
188
|
+
| Ollama | `qwen3:latest` | native auto tools | 14/15 | ExampleCo ticker became `EXCO` |
|
|
189
189
|
| Ollama | `qwen2.5:7b-instruct` | native auto tools | 14/15 | path with space was collapsed |
|
|
190
|
-
| Ollama | `gemma4:e2b` | native auto tools | 14/15 | missed Chinese
|
|
191
|
-
| Ollama | `gemma4:e4b` | native auto tools | 14/15 | missed Chinese
|
|
190
|
+
| Ollama | `gemma4:e2b` | native auto tools | 14/15 | missed Chinese ExampleCo stock tool call |
|
|
191
|
+
| Ollama | `gemma4:e4b` | native auto tools | 14/15 | missed Chinese ExampleCo stock tool call |
|
|
192
192
|
| Ollama | `lfm2.5-thinking:latest` | native auto tools | 13/15 | namespace typo and HK market error |
|
|
193
193
|
| Ollama | `qwen3:0.6b` | native auto tools | 12/15 | one timeout, weaker exact title/query handling |
|
|
194
194
|
| Ollama | `gpt-oss:latest` | native auto tools | 12/15 | path and freshness enum errors |
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# Evaluation Foundation
|
|
2
|
+
|
|
3
|
+
Stable Harness evaluates DeepAgents workloads by recording facts first, then
|
|
4
|
+
applying benchmark-neutral quality contracts. The foundation has three objects:
|
|
5
|
+
|
|
6
|
+
- `StandardRunRecord`: a normalized record for stable-harness and pure
|
|
7
|
+
DeepAgents runs.
|
|
8
|
+
- `QualityContract`: workspace-declared success criteria for final response,
|
|
9
|
+
tool calls, trajectory, workflow final state, control states, and approvals.
|
|
10
|
+
- `BenchmarkSuiteReport`: a comparable report across runtime modes such as
|
|
11
|
+
`pure_deepagents`, `stable_harness_passthrough`,
|
|
12
|
+
`stable_harness_quality_gates`, and `stable_harness_recovery`.
|
|
13
|
+
|
|
14
|
+
This design keeps DeepAgents execution semantics upstream-owned. Stable Harness
|
|
15
|
+
only records, validates, replays, compares, and governs the run.
|
|
16
|
+
|
|
17
|
+
## Supported Evaluation Shapes
|
|
18
|
+
|
|
19
|
+
- LangSmith-style evals: final response, single-step/tool, and trajectory data
|
|
20
|
+
can be projected from `createLangSmithEvaluationTarget`.
|
|
21
|
+
- BFCL-style tool evals: `QualityContract.tools.expected` validates tool
|
|
22
|
+
selection and argument subsets.
|
|
23
|
+
- Tau-bench-style workflow evals: `QualityContract.workflow.finalStateChecks`
|
|
24
|
+
validates external environment state after a task.
|
|
25
|
+
|
|
26
|
+
## Minimal Example
|
|
27
|
+
|
|
28
|
+
```ts
|
|
29
|
+
import {
|
|
30
|
+
createExternalRunRecord, createStandardRunRecord,
|
|
31
|
+
evaluateRunRecord,
|
|
32
|
+
runBenchmarkSuite,
|
|
33
|
+
} from "@stable-harness/evaluation";
|
|
34
|
+
|
|
35
|
+
const record = createStandardRunRecord({
|
|
36
|
+
run,
|
|
37
|
+
runtimeMode: "stable_harness_passthrough",
|
|
38
|
+
});
|
|
39
|
+
|
|
40
|
+
const pureDeepAgentsRecord = createExternalRunRecord({
|
|
41
|
+
requestId: "pure-1",
|
|
42
|
+
runtimeMode: "pure_deepagents",
|
|
43
|
+
input: "research task",
|
|
44
|
+
output: "done",
|
|
45
|
+
trajectory: [
|
|
46
|
+
{ kind: "tool", name: "search", status: "completed", toolId: "search" },
|
|
47
|
+
],
|
|
48
|
+
});
|
|
49
|
+
|
|
50
|
+
const evaluation = evaluateRunRecord({
|
|
51
|
+
record,
|
|
52
|
+
contract: {
|
|
53
|
+
requiredEvidence: { tools: ["search"] },
|
|
54
|
+
trajectory: {
|
|
55
|
+
mode: "ordered",
|
|
56
|
+
expected: [
|
|
57
|
+
{ kind: "tool", toolId: "search", status: "started" },
|
|
58
|
+
{ kind: "tool", toolId: "search", status: "completed" },
|
|
59
|
+
],
|
|
60
|
+
},
|
|
61
|
+
workflow: {
|
|
62
|
+
finalStateChecks: [{ path: "reservation.status", equals: "confirmed" }],
|
|
63
|
+
},
|
|
64
|
+
controlStates: { preserveAsBlockers: true },
|
|
65
|
+
},
|
|
66
|
+
finalState,
|
|
67
|
+
});
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
The same contract can be used in `runBenchmarkSuite` to compare pure DeepAgents
|
|
71
|
+
against stable-harness runtime modes under the same model, tools, tasks, trials,
|
|
72
|
+
and evaluator.
|
package/docs/guides/index.md
CHANGED
|
@@ -16,6 +16,8 @@ embed it, operate it, or explain why it exists.
|
|
|
16
16
|
portable Docker runtime with a generic persistent data mount.
|
|
17
17
|
- [Quality gates](quality-gates.md): enable plan review, execution evidence
|
|
18
18
|
review, and configured recovery loops without replacing upstream planning.
|
|
19
|
+
- [Evaluation foundation](evaluation-foundation.md): normalize run records,
|
|
20
|
+
declare quality contracts, and compare DeepAgents runtime modes.
|
|
19
21
|
- [Operator runbook](operator-runbook.md): validate a workspace, inspect
|
|
20
22
|
events, run smoke tests, and keep the runtime operable.
|
|
21
23
|
|
package/package.json
CHANGED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import { type QualityContract, type StandardEvaluationReport } from "./evaluators.js";
|
|
2
|
+
import type { BenchmarkRuntimeMode, StandardRunRecord } from "./run-record.js";
|
|
3
|
+
export type BenchmarkTask = {
|
|
4
|
+
id: string;
|
|
5
|
+
input: string;
|
|
6
|
+
quality?: QualityContract;
|
|
7
|
+
referenceOutputs?: Record<string, unknown>;
|
|
8
|
+
finalState?: unknown;
|
|
9
|
+
metadata?: Record<string, unknown>;
|
|
10
|
+
};
|
|
11
|
+
export type BenchmarkRuntime = {
|
|
12
|
+
mode: BenchmarkRuntimeMode;
|
|
13
|
+
run(task: BenchmarkTask, trial: number): Promise<BenchmarkRunOutput> | BenchmarkRunOutput;
|
|
14
|
+
};
|
|
15
|
+
export type BenchmarkRunOutput = {
|
|
16
|
+
record: StandardRunRecord;
|
|
17
|
+
finalState?: unknown;
|
|
18
|
+
};
|
|
19
|
+
export type BenchmarkSuiteInput = {
|
|
20
|
+
suiteId: string;
|
|
21
|
+
tasks: BenchmarkTask[];
|
|
22
|
+
runtimes: BenchmarkRuntime[];
|
|
23
|
+
trials?: number;
|
|
24
|
+
};
|
|
25
|
+
export type BenchmarkSuiteReport = {
|
|
26
|
+
schemaVersion: 1;
|
|
27
|
+
kind: "stable-harness.benchmark-report";
|
|
28
|
+
suiteId: string;
|
|
29
|
+
createdAt: string;
|
|
30
|
+
trials: number;
|
|
31
|
+
results: BenchmarkTaskResult[];
|
|
32
|
+
summary: BenchmarkSummary[];
|
|
33
|
+
};
|
|
34
|
+
export type BenchmarkTaskResult = {
|
|
35
|
+
taskId: string;
|
|
36
|
+
trial: number;
|
|
37
|
+
runtimeMode: BenchmarkRuntimeMode;
|
|
38
|
+
record: StandardRunRecord;
|
|
39
|
+
evaluation: StandardEvaluationReport;
|
|
40
|
+
};
|
|
41
|
+
export type BenchmarkSummary = {
|
|
42
|
+
runtimeMode: BenchmarkRuntimeMode;
|
|
43
|
+
total: number;
|
|
44
|
+
passed: number;
|
|
45
|
+
failed: number;
|
|
46
|
+
blocked: number;
|
|
47
|
+
needsReview: number;
|
|
48
|
+
passRate: number;
|
|
49
|
+
averageScores: Record<string, number>;
|
|
50
|
+
};
|
|
51
|
+
export declare function runBenchmarkSuite(input: BenchmarkSuiteInput): Promise<BenchmarkSuiteReport>;
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
import { evaluateRunRecord } from "./evaluators.js";

/**
 * Runs every task against every runtime for the requested number of trials
 * and returns a benchmark report with per-run results and per-runtime summaries.
 *
 * Runs are awaited one at a time, in trial -> task -> runtime order.
 *
 * @param {object} input - Suite definition (`suiteId`, `tasks`, `runtimes`, optional `trials`).
 * @returns {Promise<object>} A `stable-harness.benchmark-report` object.
 */
export async function runBenchmarkSuite(input) {
  const trialCount = input.trials ?? 1;
  const results = [];

  for (let trial = 0; trial < trialCount; trial += 1) {
    for (const task of input.tasks) {
      for (const runtime of input.runtimes) {
        results.push(await runTask(task, runtime, trial));
      }
    }
  }

  return {
    schemaVersion: 1,
    kind: "stable-harness.benchmark-report",
    suiteId: input.suiteId,
    createdAt: new Date().toISOString(),
    trials: trialCount,
    results,
    summary: summarizeBenchmark(results),
  };
}

/**
 * Executes one task on one runtime and evaluates the produced record against
 * the task's quality contract.
 *
 * The final state reported by the runtime wins; the task-level `finalState`
 * is only used when the runtime reports none.
 */
async function runTask(task, runtime, trial) {
  // Kept from the original build output: yields once before invoking the runtime.
  await undefined;

  const output = await runtime.run(task, trial);
  const evaluation = evaluateRunRecord({
    record: output.record,
    contract: task.quality,
    finalState: output.finalState ?? task.finalState,
  });

  return {
    taskId: task.id,
    trial,
    runtimeMode: runtime.mode,
    record: output.record,
    evaluation,
  };
}

/**
 * Groups results by runtime mode (in first-seen order) and produces one
 * summary entry per mode.
 */
function summarizeBenchmark(results) {
  const byMode = new Map();
  for (const result of results) {
    const bucket = byMode.get(result.runtimeMode);
    if (bucket === undefined) {
      byMode.set(result.runtimeMode, [result]);
    } else {
      bucket.push(result);
    }
  }

  const summaries = [];
  for (const [runtimeMode, modeResults] of byMode) {
    summaries.push(summarizeRuntime(runtimeMode, modeResults));
  }
  return summaries;
}

/** Counts verdicts and averages scores for the results of a single runtime mode. */
function summarizeRuntime(runtimeMode, modeResults) {
  const countVerdict = (verdict) =>
    modeResults.filter((result) => result.evaluation.verdict === verdict).length;

  const total = modeResults.length;
  const passed = countVerdict("pass");

  return {
    runtimeMode,
    total,
    passed,
    failed: countVerdict("fail"),
    blocked: countVerdict("blocked"),
    needsReview: countVerdict("needs_review"),
    passRate: total > 0 ? passed / total : 0,
    averageScores: averageScores(modeResults),
  };
}

/**
 * Averages each score key across the given results. A key is averaged only
 * over the results that actually reported it.
 */
function averageScores(modeResults) {
  const valuesByKey = new Map();
  for (const result of modeResults) {
    for (const [key, value] of Object.entries(result.evaluation.scores)) {
      const values = valuesByKey.get(key);
      if (values === undefined) {
        valuesByKey.set(key, [value]);
      } else {
        values.push(value);
      }
    }
  }

  const averages = {};
  for (const [key, values] of valuesByKey) {
    let sum = 0;
    for (const value of values) {
      sum += value;
    }
    averages[key] = sum / values.length;
  }
  return averages;
}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import type { BenchmarkRuntimeMode, StandardRunRecord, StandardTrajectoryStep } from "./run-record.js";
|
|
2
|
+
export type QualityContract = {
|
|
3
|
+
finalResponse?: {
|
|
4
|
+
rubric?: string;
|
|
5
|
+
requiredSubstrings?: string[];
|
|
6
|
+
};
|
|
7
|
+
requiredEvidence?: {
|
|
8
|
+
tools?: string[];
|
|
9
|
+
};
|
|
10
|
+
tools?: {
|
|
11
|
+
expected?: ExpectedToolCall[];
|
|
12
|
+
validateArguments?: boolean;
|
|
13
|
+
};
|
|
14
|
+
trajectory?: {
|
|
15
|
+
expected?: ExpectedTrajectoryStep[];
|
|
16
|
+
mode?: "any_order" | "ordered" | "judge";
|
|
17
|
+
};
|
|
18
|
+
workflow?: {
|
|
19
|
+
finalStateChecks?: WorkflowFinalStateCheck[];
|
|
20
|
+
};
|
|
21
|
+
controlStates?: {
|
|
22
|
+
preserveAsBlockers?: boolean;
|
|
23
|
+
};
|
|
24
|
+
approvals?: {
|
|
25
|
+
requiredFor?: string[];
|
|
26
|
+
};
|
|
27
|
+
};
|
|
28
|
+
export type ExpectedToolCall = {
|
|
29
|
+
toolId: string;
|
|
30
|
+
arguments?: Record<string, unknown>;
|
|
31
|
+
};
|
|
32
|
+
export type ExpectedTrajectoryStep = {
|
|
33
|
+
kind?: StandardTrajectoryStep["kind"];
|
|
34
|
+
name?: string;
|
|
35
|
+
toolId?: string;
|
|
36
|
+
subagentType?: string;
|
|
37
|
+
status?: StandardTrajectoryStep["status"];
|
|
38
|
+
};
|
|
39
|
+
export type WorkflowFinalStateCheck = {
|
|
40
|
+
path: string;
|
|
41
|
+
equals?: unknown;
|
|
42
|
+
includes?: unknown;
|
|
43
|
+
exists?: boolean;
|
|
44
|
+
};
|
|
45
|
+
export type EvaluationVerdict = "pass" | "fail" | "blocked" | "needs_review";
|
|
46
|
+
export type StandardEvaluationReport = {
|
|
47
|
+
schemaVersion: 1;
|
|
48
|
+
kind: "stable-harness.evaluation-report";
|
|
49
|
+
requestId: string;
|
|
50
|
+
runtimeMode: BenchmarkRuntimeMode;
|
|
51
|
+
verdict: EvaluationVerdict;
|
|
52
|
+
scores: Record<string, number>;
|
|
53
|
+
checks: EvaluationCheck[];
|
|
54
|
+
};
|
|
55
|
+
export type EvaluationCheck = {
|
|
56
|
+
id: string;
|
|
57
|
+
category: "final_response" | "tool_call" | "trajectory" | "workflow" | "control_state" | "approval";
|
|
58
|
+
verdict: EvaluationVerdict;
|
|
59
|
+
message: string;
|
|
60
|
+
score: number;
|
|
61
|
+
expected?: unknown;
|
|
62
|
+
observed?: unknown;
|
|
63
|
+
};
|
|
64
|
+
export declare function evaluateRunRecord(input: {
|
|
65
|
+
record: StandardRunRecord;
|
|
66
|
+
contract?: QualityContract;
|
|
67
|
+
finalState?: unknown;
|
|
68
|
+
}): StandardEvaluationReport;
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
/**
 * Evaluates a normalized run record against a quality contract.
 *
 * Each contract section that is present contributes zero or more checks; the
 * report carries every check, a per-category average score, and an overall
 * verdict derived from the checks.
 *
 * @param {object} input
 * @param {object} input.record - Run record; reads `request`, `runtimeMode`, and `trajectory`.
 * @param {object} [input.contract] - Quality contract; an absent contract yields no checks.
 * @param {unknown} [input.finalState] - External state inspected by workflow checks.
 * @returns {object} A `stable-harness.evaluation-report` object.
 */
export function evaluateRunRecord(input) {
  const contract = input.contract ?? {};
  const { record } = input;
  const checks = [
    ...evaluateFinalResponse(record, contract),
    ...evaluateRequiredEvidence(record, contract),
    ...evaluateToolCalls(record, contract),
    ...evaluateTrajectory(record, contract),
    ...evaluateWorkflow(input.finalState, contract),
    ...evaluateControlStates(record, contract),
    ...evaluateApprovals(record, contract),
  ];

  return {
    schemaVersion: 1,
    kind: "stable-harness.evaluation-report",
    requestId: record.request.requestId,
    runtimeMode: record.runtimeMode,
    verdict: summarizeVerdict(checks),
    scores: summarizeScores(checks),
    checks,
  };
}

/** Checks that the final output contains every required substring. */
function evaluateFinalResponse(record, contract) {
  const required = contract.finalResponse?.requiredSubstrings ?? [];
  if (required.length === 0) {
    return [];
  }
  const output = record.request.output ?? "";
  const missing = required.filter((substring) => !output.includes(substring));
  return [
    check(
      "final_response.required_substrings",
      "final_response",
      missing.length === 0,
      "final response contains required substrings",
      required,
      missing,
    ),
  ];
}

/** Checks that every required evidence tool has a completed trajectory step. */
function evaluateRequiredEvidence(record, contract) {
  const required = contract.requiredEvidence?.tools ?? [];
  if (required.length === 0) {
    return [];
  }
  const completed = new Set(completedTools(record.trajectory));
  const missing = required.filter((toolId) => !completed.has(toolId));
  return [
    check(
      "evidence.required_tools",
      "tool_call",
      missing.length === 0,
      "required evidence tools completed",
      required,
      missing,
    ),
  ];
}

/** Returns the tool ids of all completed steps that carry a tool id. */
function completedTools(trajectory) {
  return trajectory
    .filter((step) => step.status === "completed" && step.toolId)
    .map((step) => step.toolId);
}

/**
 * Emits one check per expected tool call. The first completed step with the
 * expected tool id is the observed call; its arguments are compared only when
 * `validateArguments` is enabled and the expectation declares arguments.
 */
function evaluateToolCalls(record, contract) {
  const expectedCalls = contract.tools?.expected ?? [];
  return expectedCalls.map((expected, position) => {
    const observed = record.trajectory.find(
      (step) => step.toolId === expected.toolId && step.status === "completed",
    );
    const argumentsOk =
      !contract.tools?.validateArguments ||
      !expected.arguments ||
      subsetMatches(observed?.arguments, expected.arguments);
    return check(
      `tool.expected.${position}`,
      "tool_call",
      Boolean(observed) && argumentsOk,
      "expected tool call completed with valid arguments",
      expected,
      observed,
    );
  });
}

/** True when `actual` is a plain object containing every expected key with an equal value. */
function subsetMatches(actual, expected) {
  if (!isRecord(actual)) {
    return false;
  }
  return Object.entries(expected).every(([key, value]) => deepEqual(actual[key], value));
}

/**
 * Checks the expected trajectory steps. `judge` mode defers to an external
 * judge (needs_review); `ordered` mode requires an in-order subsequence; any
 * other mode only requires each expected step to appear somewhere.
 */
function evaluateTrajectory(record, contract) {
  const expected = contract.trajectory?.expected ?? [];
  if (expected.length === 0) {
    return [];
  }
  const mode = contract.trajectory?.mode;
  if (mode === "judge") {
    return [
      needsReview(
        "trajectory.judge",
        "trajectory",
        "trajectory requires an external judge",
        expected,
        record.trajectory,
      ),
    ];
  }
  const matched =
    mode === "ordered"
      ? orderedMatch(record.trajectory, expected)
      : expected.every((wanted) => record.trajectory.some((step) => stepMatches(step, wanted)));
  return [
    check(
      "trajectory.expected",
      "trajectory",
      matched,
      "expected trajectory steps matched",
      expected,
      record.trajectory,
    ),
  ];
}

/** True when `expected` matches a (not necessarily contiguous) subsequence of `trajectory`. */
function orderedMatch(trajectory, expected) {
  let cursor = 0;
  for (const step of trajectory) {
    if (stepMatches(step, expected[cursor])) {
      cursor += 1;
    }
    if (cursor === expected.length) {
      return true;
    }
  }
  return expected.length === 0;
}

/** Emits one check per declared final-state assertion. */
function evaluateWorkflow(finalState, contract) {
  const stateChecks = contract.workflow?.finalStateChecks ?? [];
  return stateChecks.map((stateCheck, position) => {
    const observed = readPath(finalState, stateCheck.path);
    return check(
      `workflow.final_state.${position}`,
      "workflow",
      finalStatePasses(observed, stateCheck),
      "workflow final state check passed",
      stateCheck,
      observed,
    );
  });
}

/** Resolves a dot-separated path through plain objects; yields undefined once a segment is not a plain object. */
function readPath(root, path) {
  return path
    .split(".")
    .filter(Boolean)
    .reduce((current, segment) => (isRecord(current) ? current[segment] : undefined), root);
}

/** Applies the `exists`, `equals`, and `includes` assertions that the check declares. */
function finalStatePasses(value, stateCheck) {
  if (stateCheck.exists !== undefined && stateCheck.exists !== (value !== undefined)) {
    return false;
  }
  if ("equals" in stateCheck && !deepEqual(value, stateCheck.equals)) {
    return false;
  }
  if ("includes" in stateCheck && !includesValue(value, stateCheck.includes)) {
    return false;
  }
  return true;
}

/** Array membership by deep equality; otherwise substring containment on the stringified value. */
function includesValue(container, wanted) {
  if (Array.isArray(container)) {
    return container.some((item) => deepEqual(item, wanted));
  }
  return String(container ?? "").includes(String(wanted));
}

/**
 * When `preserveAsBlockers` is set, checks that blocked steps surface in the
 * final output.
 */
function evaluateControlStates(record, contract) {
  if (!contract.controlStates?.preserveAsBlockers) {
    return [];
  }
  const blocked = record.trajectory.filter((step) => step.status === "blocked");
  const output = record.request.output ?? "";
  // NOTE(review): passes when at least ONE blocked step name appears in the
  // output; confirm whether every blocked step is meant to be required.
  const visible = blocked.length === 0 || blocked.some((step) => output.includes(step.name));
  return [
    check(
      "control.blocker_preserved",
      "control_state",
      visible,
      "blocked control states are visible in final output",
      blocked.map((step) => step.name),
      output,
    ),
  ];
}

/** Checks that each required approval label appears inside the name of some approval step. */
function evaluateApprovals(record, contract) {
  const required = contract.approvals?.requiredFor ?? [];
  if (required.length === 0) {
    return [];
  }
  const approvalNames = record.trajectory
    .filter((step) => step.kind === "approval")
    .map((step) => step.name);
  const missing = required.filter(
    (label) => !approvalNames.some((name) => name.includes(label)),
  );
  return [
    check(
      "approval.required",
      "approval",
      missing.length === 0,
      "required approval flow observed",
      required,
      approvalNames,
    ),
  ];
}

/** True when every truthy field of `expected` equals the same field on `step`. */
function stepMatches(step, expected) {
  if (!expected) {
    return false;
  }
  const fields = ["kind", "name", "toolId", "subagentType", "status"];
  return fields.every((field) => !expected[field] || step[field] === expected[field]);
}

/** Builds a pass/fail check with a score of 1 or 0. */
function check(id, category, passed, message, expected, observed) {
  return {
    id,
    category,
    verdict: passed ? "pass" : "fail",
    message,
    score: passed ? 1 : 0,
    expected,
    observed,
  };
}

/** Builds a zero-score check that must be resolved by an external reviewer. */
function needsReview(id, category, message, expected, observed) {
  return {
    id,
    category,
    verdict: "needs_review",
    message,
    score: 0,
    expected,
    observed,
  };
}

/**
 * Collapses check verdicts into one, by precedence:
 * blocked > fail > needs_review > pass.
 */
function summarizeVerdict(checks) {
  // The returned expression must begin on the same line as `return`:
  // a line break right after `return` makes the function return undefined.
  const precedence = ["blocked", "fail", "needs_review"];
  const worst = precedence.find((verdict) => checks.some((item) => item.verdict === verdict));
  return worst ?? "pass";
}

/** Averages check scores per category. */
function summarizeScores(checks) {
  const scoresByCategory = new Map();
  for (const item of checks) {
    const scores = scoresByCategory.get(item.category);
    if (scores === undefined) {
      scoresByCategory.set(item.category, [item.score]);
    } else {
      scores.push(item.score);
    }
  }
  return Object.fromEntries(
    [...scoresByCategory].map(([category, scores]) => [
      category,
      scores.reduce((sum, score) => sum + score, 0) / scores.length,
    ]),
  );
}

/**
 * JSON.stringify replacer that re-emits plain objects with sorted keys, so the
 * serialized form does not depend on property insertion order.
 */
function sortKeysReplacer(_key, value) {
  if (!isRecord(value)) {
    return value;
  }
  return Object.fromEntries(
    Object.keys(value)
      .sort()
      .map((key) => [key, value[key]]),
  );
}

/**
 * Structural equality via canonical JSON. Values are equal when their JSON
 * serializations match after object keys are sorted, so `{a:1,b:2}` equals
 * `{b:2,a:1}`.
 */
function deepEqual(left, right) {
  return JSON.stringify(left, sortKeysReplacer) === JSON.stringify(right, sortKeysReplacer);
}

/** True for non-null, non-array objects. */
function isRecord(value) {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}
|
|
@@ -1,4 +1,10 @@
|
|
|
1
1
|
export { createTraceRecorder, createReplayManifest, createEvaluationBundle } from "./trace.js";
|
|
2
2
|
export { computeToolCallMetrics } from "./tool-call-metrics.js";
|
|
3
|
+
export { createStandardRunRecord, createExternalRunRecord, createLangSmithEvaluationTarget, projectTrajectory } from "./run-record.js";
|
|
4
|
+
export { evaluateRunRecord } from "./evaluators.js";
|
|
5
|
+
export { runBenchmarkSuite } from "./benchmark.js";
|
|
3
6
|
export type { EvaluationBundle, ReplayManifest } from "./types.js";
|
|
4
7
|
export type { PerToolCallMetrics, ToolCallMetrics } from "./tool-call-metrics.js";
|
|
8
|
+
export type { BenchmarkRuntimeMode, ExternalRunRecordInput, LangSmithEvaluationTarget, StandardRunRecord, StandardTrajectoryStep, } from "./run-record.js";
|
|
9
|
+
export type { EvaluationCheck, EvaluationVerdict, ExpectedToolCall, ExpectedTrajectoryStep, QualityContract, StandardEvaluationReport, WorkflowFinalStateCheck, } from "./evaluators.js";
|
|
10
|
+
export type { BenchmarkRunOutput, BenchmarkRuntime, BenchmarkSuiteInput, BenchmarkSuiteReport, BenchmarkSummary, BenchmarkTask, BenchmarkTaskResult, } from "./benchmark.js";
|
|
@@ -1 +1 @@
|
|
|
1
|
-
export{createTraceRecorder,createReplayManifest,createEvaluationBundle}from"./trace.js";export{computeToolCallMetrics}from"./tool-call-metrics.js";
|
|
1
|
+
export{createTraceRecorder,createReplayManifest,createEvaluationBundle}from"./trace.js";export{computeToolCallMetrics}from"./tool-call-metrics.js";export{createStandardRunRecord,createExternalRunRecord,createLangSmithEvaluationTarget,projectTrajectory}from"./run-record.js";export{evaluateRunRecord}from"./evaluators.js";export{runBenchmarkSuite}from"./benchmark.js";
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import { type RuntimeEvent, type RuntimeRunRecord, type RuntimeTraceSpan } from "@stable-harness/core";
|
|
2
|
+
export type BenchmarkRuntimeMode = "pure_deepagents" | "stable_harness_passthrough" | "stable_harness_quality_gates" | "stable_harness_recovery";
|
|
3
|
+
export type StandardRunRecord = {
|
|
4
|
+
schemaVersion: 1;
|
|
5
|
+
kind: "stable-harness.run-record";
|
|
6
|
+
runtimeMode: BenchmarkRuntimeMode;
|
|
7
|
+
request: {
|
|
8
|
+
requestId: string;
|
|
9
|
+
sessionId: string;
|
|
10
|
+
agentId: string;
|
|
11
|
+
input: string;
|
|
12
|
+
state: RuntimeRunRecord["state"];
|
|
13
|
+
output?: string;
|
|
14
|
+
error?: string;
|
|
15
|
+
};
|
|
16
|
+
trajectory: StandardTrajectoryStep[];
|
|
17
|
+
spans: RuntimeTraceSpan[];
|
|
18
|
+
raw: {
|
|
19
|
+
events: RuntimeEvent[];
|
|
20
|
+
backendTrace?: unknown;
|
|
21
|
+
};
|
|
22
|
+
artifacts: RuntimeRunRecord["artifacts"];
|
|
23
|
+
metadata?: Record<string, unknown>;
|
|
24
|
+
};
|
|
25
|
+
export type StandardTrajectoryStep = {
|
|
26
|
+
index: number;
|
|
27
|
+
kind: "message" | "tool" | "subagent" | "approval" | "quality" | "control" | "artifact" | "other";
|
|
28
|
+
name: string;
|
|
29
|
+
status: "started" | "completed" | "failed" | "blocked" | "event";
|
|
30
|
+
toolId?: string;
|
|
31
|
+
subagentType?: string;
|
|
32
|
+
arguments?: unknown;
|
|
33
|
+
output?: unknown;
|
|
34
|
+
sourceEventType: RuntimeEvent["type"];
|
|
35
|
+
sourceEventId?: string;
|
|
36
|
+
};
|
|
37
|
+
export type LangSmithEvaluationTarget = {
|
|
38
|
+
inputs: Record<string, unknown>;
|
|
39
|
+
outputs: Record<string, unknown>;
|
|
40
|
+
referenceOutputs?: Record<string, unknown>;
|
|
41
|
+
metadata: Record<string, unknown>;
|
|
42
|
+
};
|
|
43
|
+
export type ExternalRunRecordInput = {
|
|
44
|
+
requestId: string;
|
|
45
|
+
sessionId?: string;
|
|
46
|
+
agentId?: string;
|
|
47
|
+
input: string;
|
|
48
|
+
output?: string;
|
|
49
|
+
error?: string;
|
|
50
|
+
state?: RuntimeRunRecord["state"];
|
|
51
|
+
runtimeMode: BenchmarkRuntimeMode;
|
|
52
|
+
trajectory?: Array<Omit<StandardTrajectoryStep, "index" | "sourceEventType"> & {
|
|
53
|
+
sourceEventType?: RuntimeEvent["type"];
|
|
54
|
+
}>;
|
|
55
|
+
events?: RuntimeEvent[];
|
|
56
|
+
backendTrace?: unknown;
|
|
57
|
+
artifacts?: RuntimeRunRecord["artifacts"];
|
|
58
|
+
metadata?: Record<string, unknown>;
|
|
59
|
+
};
|
|
60
|
+
export declare function createStandardRunRecord(input: {
|
|
61
|
+
run: RuntimeRunRecord;
|
|
62
|
+
runtimeMode?: BenchmarkRuntimeMode;
|
|
63
|
+
backendTrace?: unknown;
|
|
64
|
+
metadata?: Record<string, unknown>;
|
|
65
|
+
}): StandardRunRecord;
|
|
66
|
+
export declare function createExternalRunRecord(input: ExternalRunRecordInput): StandardRunRecord;
|
|
67
|
+
export declare function createLangSmithEvaluationTarget(record: StandardRunRecord, referenceOutputs?: Record<string, unknown>): LangSmithEvaluationTarget;
|
|
68
|
+
export declare function projectTrajectory(events: RuntimeEvent[]): StandardTrajectoryStep[];
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
import { projectRuntimeTraceSpans } from "@stable-harness/core";

/**
 * Builds a standard run record from a runtime run.
 *
 * @param {object} input
 * @param {object} input.run - Runtime run record; its events are projected into the trajectory.
 * @param {string} [input.runtimeMode] - Defaults to "stable_harness_passthrough".
 * @param {unknown} [input.backendTrace] - Stored under `raw.backendTrace`.
 * @param {object} [input.metadata] - Falls back to `run.metadata` when absent.
 * @returns {object} A `stable-harness.run-record` object.
 */
export function createStandardRunRecord(input) {
  const { run } = input;
  const spans = projectRuntimeTraceSpans(run);

  return {
    schemaVersion: 1,
    kind: "stable-harness.run-record",
    runtimeMode: input.runtimeMode ?? "stable_harness_passthrough",
    request: {
      requestId: run.requestId,
      sessionId: run.sessionId,
      agentId: run.agentId,
      input: run.input,
      state: run.state,
      output: run.output,
      error: run.error,
    },
    trajectory: projectTrajectory(run.events),
    spans,
    raw: { events: run.events, backendTrace: input.backendTrace },
    artifacts: run.artifacts,
    metadata: input.metadata ?? run.metadata,
  };
}

/**
 * Builds a standard run record from externally supplied data.
 *
 * Defaults: `sessionId` falls back to `requestId`, `agentId` to "deepagents",
 * and `state` to "failed" when `error` is truthy, otherwise "completed".
 *
 * @param {object} input - External run description.
 * @returns {object} A `stable-harness.run-record` object with empty `spans`.
 */
export function createExternalRunRecord(input) {
  const sessionId = input.sessionId ?? input.requestId;
  const agentId = input.agentId ?? "deepagents";
  const fallbackState = input.error ? "failed" : "completed";

  return {
    schemaVersion: 1,
    kind: "stable-harness.run-record",
    runtimeMode: input.runtimeMode,
    request: {
      requestId: input.requestId,
      sessionId,
      agentId,
      input: input.input,
      state: input.state ?? fallbackState,
      output: input.output,
      error: input.error,
    },
    trajectory: normalizeExternalTrajectory(input.trajectory ?? []),
    spans: [],
    raw: { events: input.events ?? [], backendTrace: input.backendTrace },
    artifacts: input.artifacts ?? [],
    metadata: input.metadata,
  };
}

/**
 * Projects a run record into an inputs/outputs/referenceOutputs/metadata
 * target. Trajectory steps are reduced to kind, name, status, toolId and
 * subagentType.
 *
 * @param {object} record - Standard run record.
 * @param {object} [referenceOutputs] - Passed through unchanged.
 * @returns {object} The evaluation target.
 */
export function createLangSmithEvaluationTarget(record, referenceOutputs) {
  const { request } = record;
  const trajectory = record.trajectory.map((step) => ({
    kind: step.kind,
    name: step.name,
    status: step.status,
    toolId: step.toolId,
    subagentType: step.subagentType,
  }));

  return {
    inputs: { input: request.input },
    outputs: { output: request.output ?? "", trajectory },
    referenceOutputs,
    metadata: {
      requestId: request.requestId,
      sessionId: request.sessionId,
      agentId: request.agentId,
      runtimeMode: record.runtimeMode,
      state: request.state,
    },
  };
}

/**
 * Converts runtime events into trajectory steps. Events with no projection
 * are dropped, and the remaining steps are indexed consecutively from 0.
 *
 * @param {object[]} events - Runtime events.
 * @returns {object[]} Indexed trajectory steps.
 */
export function projectTrajectory(events) {
  const steps = [];
  for (const event of events) {
    const step = projectStep(event);
    if (isStep(step)) {
      steps.push({ ...step, index: steps.length });
    }
  }
  return steps;
}

/** Indexes externally supplied steps and defaults their source event type. */
function normalizeExternalTrajectory(trajectory) {
  const steps = trajectory ?? [];
  return steps.map((step, index) => ({
    ...step,
    index,
    sourceEventType: step.sourceEventType ?? "runtime.adapter.event",
  }));
}

/** Maps one runtime event to a trajectory step, or undefined when the event type is not projected. */
function projectStep(event) {
  switch (event.type) {
    case "runtime.tool.direct.started":
      return toolStep(event, "started");
    case "runtime.tool.direct.completed":
      return toolStep(event, "completed", event.output);
    case "runtime.tool.failure":
      return toolStep(event, "failed", event.error);
    case "runtime.sandbox.decision":
      return namedStep(event, "control", `sandbox:${event.toolId}`, "event");
    case "runtime.tool.circuit.opened":
      return namedStep(event, "control", `circuit:${event.toolId}`, "blocked");
    case "runtime.execution.contract.failed":
      return namedStep(event, "quality", event.reason, "blocked");
    case "runtime.artifact.created":
      return namedStep(event, "artifact", event.artifact.id, "event");
    case "runtime.adapter.event":
      return adapterStep(event);
    default:
      break;
  }

  if (event.type.startsWith("runtime.approval.")) {
    return namedStep(event, "approval", event.type, "blocked");
  }
  if (event.type.startsWith("runtime.quality.")) {
    return namedStep(event, "quality", event.type, "event");
  }
  if (event.type.startsWith("runtime.request.")) {
    return namedStep(event, "message", event.type, requestStatus(event));
  }
  return undefined;
}

/** Maps request lifecycle event types to a step status; cancelled maps to "blocked". */
function requestStatus(event) {
  switch (event.type) {
    case "runtime.request.completed":
      return "completed";
    case "runtime.request.failed":
      return "failed";
    case "runtime.request.cancelled":
      return "blocked";
    default:
      return "started";
  }
}

/** Projects an adapter event by inspecting its payload for tool and delegation markers. */
function adapterStep(event) {
  const payload = isPlainRecord(event.event) ? event.event : {};

  if (isToolStart(payload)) {
    return adapterToolStep(event, payload, "started");
  }
  if (isToolResult(payload)) {
    return adapterToolStep(event, payload, "completed");
  }
  if (payload.traceType === "delegation") {
    return delegationStep(event, payload);
  }

  const name =
    readString(payload.phase) ?? readString(payload.eventType) ?? "runtime.adapter.event";
  return namedStep(event, "other", name, "event", payload);
}

/** True for non-null, non-array objects. */
function isPlainRecord(value) {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/** True when the adapter payload marks the start of a tool execution. */
function isToolStart(payload) {
  return (
    payload.eventType === "deepagents.tool_execution.start" ||
    payload.phase === "agent.tool.start"
  );
}

/** True when the adapter payload marks the result of a tool execution. */
function isToolResult(payload) {
  return (
    payload.eventType === "deepagents.tool_execution.result" ||
    payload.phase === "agent.tool.result"
  );
}

/** Builds a subagent step from a delegation payload; a trace label ending in ".completed" marks completion. */
function delegationStep(event, payload) {
  const finished = readString(payload.traceLabel)?.endsWith(".completed");
  const status = finished ? "completed" : "started";
  const base = namedStep(
    event,
    "subagent",
    readString(payload.subagentType) ?? "task",
    status,
    payload,
  );

  return {
    ...base,
    toolId: "task",
    subagentType: readString(payload.subagentType),
    arguments: payload.taskInput ?? payload.args,
    output: payload.output,
  };
}

/** Builds a tool step from an adapter payload; the "task" tool is recorded as a subagent step. */
function adapterToolStep(event, payload, status) {
  const toolId = readString(payload.toolId) ?? readString(payload.name) ?? "unknown";
  const kind = toolId === "task" ? "subagent" : "tool";

  return {
    ...namedStep(event, kind, toolId, status, payload),
    toolId,
    subagentType: readString(payload.subagentType),
    arguments: payload.args,
    output: payload.output,
  };
}

/** Builds a tool step from a direct tool event, named after the event's tool id. */
function toolStep(event, status, output) {
  return {
    ...namedStep(event, "tool", event.toolId, status),
    toolId: event.toolId,
    output,
  };
}

/** Builds the fields shared by every step and records the originating event type and id. */
function namedStep(event, kind, name, status, args) {
  return {
    kind,
    name,
    status,
    arguments: args,
    sourceEventType: event.type,
    sourceEventId: event.eventId,
  };
}

/** True when a projection produced a step. */
function isStep(step) {
  return step !== undefined;
}

/** Returns the value when it is a string with non-whitespace content, otherwise undefined. */
function readString(value) {
  if (typeof value === "string" && value.trim()) {
    return value;
  }
  return undefined;
}
|