@alis-build/harness-eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +700 -0
- package/dist/adapters/claude-code/index.d.ts +3 -0
- package/dist/adapters/claude-code/index.js +2 -0
- package/dist/build-DsVJ_UeU.js +1396 -0
- package/dist/build-DsVJ_UeU.js.map +1 -0
- package/dist/cardinality-DlE44e-4.js +31 -0
- package/dist/cardinality-DlE44e-4.js.map +1 -0
- package/dist/claude-code-ycT0JQZF.js +563 -0
- package/dist/claude-code-ycT0JQZF.js.map +1 -0
- package/dist/cli/bin.d.ts +1 -0
- package/dist/cli/bin.js +623 -0
- package/dist/cli/bin.js.map +1 -0
- package/dist/config/loader.d.ts +2 -0
- package/dist/config/loader.js +2 -0
- package/dist/index-6Z17eKZx.d.ts +72 -0
- package/dist/index.d.ts +725 -0
- package/dist/index.js +5 -0
- package/dist/loader-BCnFJ8rm.js +717 -0
- package/dist/loader-BCnFJ8rm.js.map +1 -0
- package/dist/loader-DTvoVfN0.d.ts +33 -0
- package/dist/rolldown-runtime-D7D4PA-g.js +13 -0
- package/dist/runner/suite.d.ts +2 -0
- package/dist/runner/suite.js +2 -0
- package/dist/suite-BoOvK_lq.d.ts +7 -0
- package/dist/suite-chj0j22j.js +684 -0
- package/dist/suite-chj0j22j.js.map +1 -0
- package/dist/types-B9H4IZtA.d.ts +305 -0
- package/dist/types-BQol062t.d.ts +292 -0
- package/package.json +74 -0
- package/schemas/eval-interchange-agent-trace.schema.json +322 -0
- package/schemas/eval-interchange-proto-instance.schema.json +106 -0
- package/schemas/eval-interchange.schema.json +140 -0
- package/schemas/eval-run-envelope.schema.json +2195 -0
- package/schemas/trajectory-view.schema.json +441 -0
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,725 @@
|
|
|
1
|
+
import { A as Usage, B as isUserMessage, C as SystemInitEvent, D as TextBlock, E as SystemUnknownEvent, F as isSystemInit, I as isSystemRetry, L as isTextBlock, M as UserMessageEvent, N as isAssistantMessage, O as ToolResultBlock, P as isResult, R as isToolResultBlock, S as SystemCompactBoundaryEvent, T as SystemRetryEvent, _ as ContentBlock, a as HarnessAdapter, b as StopReason, c as AssistantTurn, d as ToolCall, f as TrajectoryView, g as AssistantMessageEvent, h as AssistantMessage, i as BaseAdapterConfig, j as UserMessage, k as ToolUseBlock, l as RetryRecord, m as namespaceOf, n as AdapterError, o as ParseErrorRecord, p as UsageSummary, r as AdapterResult, s as SuiteConfig, t as AdapterDiagnostics, u as SessionMeta, v as McpServerStatus, w as SystemPluginInstallEvent, x as StreamEvent, y as ResultEvent, z as isToolUseBlock } from "./types-B9H4IZtA.js";
|
|
2
|
+
import { n as index_d_exports, o as ClaudeCodeOptions } from "./index-6Z17eKZx.js";
|
|
3
|
+
import { _ as ObjectPredicate, a as ProgressEvent, b as ToolPattern, c as RunSuiteOptions, d as TestSuite, f as Assertion, g as LeafPredicate, h as CompoundPredicate, i as ProgressCallback, l as SuiteReport, m as Cardinality, n as CellReport, o as RepetitionError, p as AssertionResult, r as MatrixCell, s as RepetitionResult, t as AssertionStat, u as TestCase, v as Predicate, y as ThresholdedAssertion } from "./types-BQol062t.js";
|
|
4
|
+
import { i as GradingConfig, r as parseSuite, s as ConfigError, t as loadSuite } from "./loader-DTvoVfN0.js";
|
|
5
|
+
import { t as runSuite } from "./suite-BoOvK_lq.js";
|
|
6
|
+
import { Readable } from "node:stream";
|
|
7
|
+
|
|
8
|
+
//#region src/types/eval-interchange.d.ts
/**
 * TypeScript types for eval interchange output.
 */
/** One tool call whose input is carried as a string (presumably serialized JSON — confirm against the producer). */
interface InterchangeToolCall {
  tool_name: string;
  tool_input: string;
}
/** Ordered list of {@link InterchangeToolCall} entries. */
interface InterchangeTrajectory {
  tool_calls: InterchangeToolCall[];
}
/** One tool call whose input is left untyped (`unknown`) rather than a string. */
interface TabularToolCall {
  tool_name: string;
  tool_input: unknown;
}
/** One part of an agent event's content: text, a function call, or a function response (all optional). */
interface ContentPart {
  text?: string;
  function_call?: {
    name: string;
    args: unknown;
  };
  function_response?: {
    name: string;
    response: unknown;
  };
}
/** A single authored event inside a conversation turn. */
interface AgentEvent {
  author: string;
  content: {
    parts: ContentPart[];
  };
  event_time?: string;
  state_delta?: Record<string, unknown>;
  active_tools?: Array<{
    name: string;
  }>;
}
/** One conversation turn: its index, optional id, and the events it contains. */
interface ConversationTurn {
  turn_index: number;
  turn_id?: string;
  events: AgentEvent[];
}
/** Static description of one agent referenced by an {@link AgentTrace}. */
interface AgentConfig {
  agent_id: string;
  agent_type?: string;
  description?: string;
  instruction?: string;
  tools?: Array<{
    name: string;
  }>;
  sub_agents?: string[];
}
/** Agent configurations keyed by string, plus the conversation turns. */
interface AgentTrace {
  agents: Record<string, AgentConfig>;
  turns: ConversationTurn[];
}
/** One dataset row using {@link TabularToolCall} trajectories. */
interface EvalDatasetRow {
  prompt?: string;
  response?: string;
  reference?: string;
  predicted_trajectory: TabularToolCall[];
  reference_trajectory?: TabularToolCall[];
  latency_in_seconds: number;
  failure: 0 | 1;
  human_ratings?: Record<string, number>;
}
/** One instance using {@link InterchangeTrajectory} (string-input) trajectories. */
interface ProtoTrajectoryInstance {
  predicted_trajectory: InterchangeTrajectory;
  reference_trajectory?: InterchangeTrajectory;
  prompt?: string;
  response?: string;
  reference?: string;
}
/** Numeric trajectory-level metrics. */
interface TrajectoryMetrics {
  trajectory_exact_match: number;
  trajectory_in_order_match: number;
  trajectory_any_order_match: number;
  trajectory_precision: number;
  trajectory_recall: number;
  trajectory_single_tool_use: number;
}
/** Numeric tool-call-level metrics. */
interface ToolCallMetrics {
  tool_call_valid: number;
  tool_name_match: number;
  tool_parameter_key_match: number;
  tool_parameter_kv_match: number;
}
//#endregion
|
|
96
|
+
//#region src/grader/types.d.ts
/**
 * Outcome grading types (LLM-as-judge layer).
 *
 * Behavioral assertions live in harness-eval assertions; expectations here
 * are natural-language outcome checks graded from trajectory transcripts.
 */
/** One graded expectation: its text, whether it passed, and the supporting evidence. */
interface GradedExpectation {
  text: string;
  passed: boolean;
  evidence: string;
}
/** Pass/fail counts and pass rate for a set of graded expectations. */
interface GradingSummary {
  passed: number;
  failed: number;
  total: number;
  passRate: number;
}
/** One feedback suggestion, optionally tied to an assertion. */
interface EvalFeedbackSuggestion {
  assertion?: string;
  reason: string;
}
/** Feedback attached to a grading result: per-item suggestions plus an overall note. */
interface EvalFeedback {
  suggestions: EvalFeedbackSuggestion[];
  overall: string;
}
/** Grading result for one repetition. */
interface RepGradingResult {
  caseId: string;
  cellLabel: string;
  repetitionIndex: number;
  prompt: string;
  expectations: GradedExpectation[];
  summary: GradingSummary;
  evalFeedback?: EvalFeedback;
  /** Set when the grader subprocess failed or returned unparseable output. */
  graderError?: string;
  durationMs: number;
}
/** Grading results for a whole suite report, with an aggregate summary. */
interface SuiteGradingReport {
  gradedAt: string;
  sourceReport: string;
  /** Grading YAML path when `--config` was used. */
  gradingConfigPath?: string;
  results: RepGradingResult[];
  summary: GradingSummary;
}
/** Options accepted when grading a report. */
interface GradeReportOptions {
  /** Path to the report being graded (stored in output). */
  sourceReport?: string;
  /** Path to expectations YAML/JSON sidecar (case id → strings). */
  expectationsPath?: string;
  /** Claude binary for grading. Default: `claude`. */
  binary?: string;
  /** Model for the grader subprocess. */
  model?: string;
  /** Optional judge prompt prefix (maps to upstream system_instruction). */
  systemInstruction?: string;
  /** Timeout per grading call (ms). Default 300000 (5 min). */
  timeoutMs?: number;
  /** Max concurrent grader subprocesses. Default 2. */
  maxConcurrent?: number;
  /** Process env for the judge subprocess (merged over inherited env). */
  env?: Record<string, string>;
  /** Working directory for the judge subprocess. */
  cwd?: string;
  /** Claude Code options for the judge (nested in grading YAML under `claudeCode`). */
  claudeCode?: Record<string, unknown>;
  /** Path to grading YAML when `--config` was used. */
  gradingConfigPath?: string;
  /** Inject a custom grader (for tests). */
  gradeFn?: GraderFn;
  onProgress?: (event: GradeProgressEvent) => void;
}
/** Progress notification passed to `onProgress`, discriminated by `kind`. */
type GradeProgressEvent = {
  kind: "grade-start";
  total: number;
} | {
  kind: "grade-complete";
  caseId: string;
  cellLabel: string;
  repetitionIndex: number;
  passed: number;
  failed: number;
  durationMs: number;
  graderError?: string;
} | {
  kind: "grade-done";
  durationMs: number;
  totalExpectations: number;
  passedExpectations: number;
};
/** Async grader: takes a {@link GraderInput} and resolves to a {@link GraderOutput}. */
type GraderFn = (input: GraderInput) => Promise<GraderOutput>;
/** Input handed to a {@link GraderFn}. */
interface GraderInput {
  prompt: string;
  transcript: string;
  expectations: string[];
  systemInstruction?: string;
}
/** Output produced by a {@link GraderFn}. */
interface GraderOutput {
  expectations: GradedExpectation[];
  summary: GradingSummary;
  evalFeedback?: EvalFeedback;
  error?: string;
}
//#endregion
|
|
202
|
+
//#region src/types/eval-record.d.ts
/** Schema version for {@link EvalRunEnvelope} JSON documents. */
declare const EVAL_RUN_SCHEMA_VERSION = "1.0";
/** Schema version embedded in each {@link TrajectoryView} at export time. */
declare const TRAJECTORY_SCHEMA_VERSION = "1.0";
/** Link to the suite spec that produced a run. */
interface SuiteReference {
  /** Absolute or repo-relative path to the suite YAML. */
  uri?: string;
  /** Stable suite identifier when known (e.g. case bundle name). */
  id?: string;
  /** SHA-256 or similar hash of suite file contents. */
  contentHash?: string;
}
/** Harness that executed the run. */
interface HarnessInfo {
  /** Adapter id from suite YAML, e.g. `claude-code`. */
  adapter: string;
  /** harness-eval package version when envelope was built. */
  frameworkVersion?: string;
  /** Optional harness binary version (e.g. `claude -v`). */
  harnessVersion?: string;
}
/** CI, git, or runtime provenance for correlation in the DB. */
interface EvalProvenance {
  runId?: string;
  ci?: {
    provider?: string;
    jobId?: string;
    pipelineId?: string;
    url?: string;
  };
  git?: {
    commit?: string;
    branch?: string;
    repository?: string;
  };
  pluginVersion?: string;
  triggeredBy?: string;
  /** Additional keys are permitted and left untyped. */
  [key: string]: unknown;
}
/** Aggregate behavioral summary for the run. */
interface EvalRunSummary {
  cellsTotal: number;
  cellsPassed: number;
  /** All cells passed behavioral assertion thresholds. */
  behavioralPass: boolean;
  /** All graded expectations passed (when outcome layer present). */
  outcomePass?: boolean;
}
/** Identity of the judge that produced outcome grades. */
interface JudgeInfo {
  id: string;
  model?: string;
  version?: string;
}
/** Outcome grades for one repetition (built-in or external judge). */
interface OutcomeGrades {
  judge: JudgeInfo;
  expectations: GradedExpectation[];
  summary: GradingSummary;
  evalFeedback?: {
    suggestions: Array<{
      assertion?: string;
      reason: string;
    }>;
    overall: string;
  };
  error?: string;
}
/** Score from an external eval framework (LangSmith, Braintrust, custom). */
interface ExternalScore {
  source: string;
  metric: string;
  value: number | boolean | string;
  metadata?: Record<string, unknown>;
}
/** Optional large or vendor-specific blobs (store by reference in DB when possible). */
interface EvalArtifacts {
  /** Claude Code `stream-json` lines — debug only, not cross-harness. */
  rawStreamEvents?: unknown[];
  /** URI to OTLP JSON (S3, GCS, etc.). */
  otlpTraceUri?: string;
  /** Text transcript for judges (`trajectoryToTranscript`). */
  transcript?: string;
}
/**
 * One harness invocation — the unit external judges and trajectory queries use.
 */
interface EvalRepetition {
  repetitionIndex: number;
  durationMs: number;
  /** Normalized harness session. Required when the harness completed with a view. */
  trajectory?: TrajectoryView & {
    schemaVersion: string;
  };
  diagnostics?: Partial<AdapterDiagnostics>;
  assertionResults: AssertionResult[];
  outcomeGrades?: OutcomeGrades;
  externalScores?: ExternalScore[];
  artifacts?: EvalArtifacts;
  /** Interchange-format predicted tool-call trajectory. */
  predicted_trajectory?: InterchangeToolCall[];
  /** Full multi-turn agent trace in interchange format. */
  agent_trace?: AgentTrace;
  /** Session latency in seconds (interchange field). */
  latency_in_seconds?: number;
  /** 1 when the harness run failed, 0 on success (interchange field). */
  failure?: 0 | 1;
  /** Trajectory-level metrics when reference_trajectory is provided. */
  trajectoryMetrics?: TrajectoryMetrics;
  /** Tool-call-level metrics when reference_trajectory is provided. */
  toolCallMetrics?: ToolCallMetrics;
  error?: {
    message: string;
    diagnostics?: Partial<AdapterDiagnostics>;
  };
}
/** Behavioral stats for one assertion across repetitions in a cell. */
interface EvalAssertionStat {
  description: string;
  threshold: number;
  passedCount: number;
  evaluatedCount: number;
  passRate: number;
  meetsThreshold: boolean;
}
/** One (test case × matrix cell) result. */
interface EvalCellResult {
  caseId: string;
  category?: string;
  notes?: string;
  prompt?: string;
  expectations?: string[];
  cellLabel: string;
  axes?: Record<string, string>;
  /** Reference tool-call trajectory for metric computation. */
  reference_trajectory?: TabularToolCall[];
  /** Human ratings keyed by metric name for judge calibration. */
  human_ratings?: Record<string, number>;
  assertionStats: EvalAssertionStat[];
  adapterErrors: number;
  /** Passed all behavioral assertion thresholds for this cell. */
  behavioralPass: boolean;
  /** Passed all outcome expectations when graded; omitted if not graded. */
  outcomePass?: boolean;
  repetitions: EvalRepetition[];
}
/**
 * Top-level document for CI/CD pipelines, APIs, and databases.
 *
 * This is the interchange format your storage layer should target — not
 * {@link StreamEvent} or OTLP traces.
 */
interface EvalRunEnvelope {
  schemaVersion: typeof EVAL_RUN_SCHEMA_VERSION;
  runId: string;
  startedAt: string;
  durationMs: number;
  suite?: SuiteReference;
  harness: HarnessInfo;
  provenance?: EvalProvenance;
  summary: EvalRunSummary;
  cells: EvalCellResult[];
}
/** Options for building an {@link EvalRunEnvelope}. */
interface BuildEvalRunEnvelopeOptions {
  /** UUID for this run; generated if omitted. */
  runId?: string;
  suite?: SuiteReference;
  harness?: Partial<HarnessInfo>;
  provenance?: EvalProvenance;
  /** Merge outcome grades from `gradeReport()` or compatible structure. */
  grading?: {
    gradedAt?: string;
    sourceReport?: string;
    results: Array<{
      caseId: string;
      cellLabel: string;
      repetitionIndex: number;
      expectations: GradedExpectation[];
      summary: GradingSummary;
      evalFeedback?: OutcomeGrades["evalFeedback"];
      graderError?: string;
      durationMs?: number;
    }>;
    judge?: JudgeInfo;
  };
  /** Include transcript in each repetition's artifacts. Default true. */
  includeTranscript?: boolean;
  /** Include raw stream events when adapter provides them. Default false. */
  includeRawStreamEvents?: boolean;
}
//#endregion
|
|
395
|
+
//#region src/trajectory/builder.d.ts
/**
 * Incremental builder: feed stream events through `consume()` and call
 * `build()` to obtain the resulting {@link TrajectoryView}.
 */
declare class TrajectoryBuilder {
  private meta;
  private sessionStartTs;
  private turns;
  private allToolCalls;
  /**
   * tool_use_id → ToolCall, for matching results back to calls.
   * Entries are removed once a result is observed.
   */
  private pendingCalls;
  private retries;
  private finalUsage;
  private finalCostUsd;
  private finalDurationMs;
  private finalNumTurns;
  private finalResultText;
  private sawResultEvent;
  private resultIsError;
  /**
   * Consume one event. Safe to call with events in stream order.
   *
   * Unknown event types are silently ignored — the schema evolves and we
   * don't want CI to break on a new event type we haven't modelled.
   */
  consume(event: StreamEvent): void;
  /**
   * Finalize the view. Call after consuming the last event from the stream.
   *
   * Throws if no `system/init` was observed — at that point we have no model,
   * no session id, and no available-tools list, which means assertions like
   * "called any mcp__api__* tool" can't even be evaluated meaningfully.
   */
  build(): TrajectoryView;
  private handleAssistantMessage;
  private handleUserMessage;
}
/**
 * Convenience: drain an async iterable of events through a fresh builder.
 *
 * Suitable when you have the full event stream and just want the view.
 * For interactive/incremental scenarios (e.g. surfacing partial state in a UI)
 * instantiate {@link TrajectoryBuilder} directly and call `consume()` /
 * `build()` yourself.
 */
declare function buildTrajectory(events: AsyncIterable<StreamEvent>): Promise<TrajectoryView>;
//#endregion
|
|
442
|
+
//#region src/parsers/stream-json.d.ts
/**
 * Result of attempting to parse a single line.
 *
 * Successful parses yield `{ ok: true }` with the typed event and the raw line
 * (kept for diagnostics and OTel `events.attributes.raw`). Failed parses yield
 * `{ ok: false }` with the parse error and the raw line — callers can log,
 * skip, or fail the run as they see fit.
 */
type ParseResult = {
  ok: true;
  event: StreamEvent;
  rawLine: string;
} | {
  ok: false;
  error: Error;
  rawLine: string;
};
/**
 * Parse a readable stream of NDJSON into a sequence of typed stream-json events.
 *
 * @example
 * const child = spawn("claude", ["-p", prompt, "--output-format", "stream-json", "--verbose"]);
 * for await (const result of parseStreamJson(child.stdout)) {
 *   if (result.ok) builder.consume(result.event);
 *   else console.warn("malformed stream line:", result.rawLine, result.error);
 * }
 */
declare function parseStreamJson(stream: Readable): AsyncGenerator<ParseResult, void, void>;
//#endregion
|
|
472
|
+
//#region src/assertions/evaluator.d.ts
/**
 * Evaluate one assertion against a trajectory view.
 *
 * The switch is exhaustive — TypeScript's `never` check at the end will
 * flag any new variant added to the `Assertion` union that hasn't been
 * wired up here.
 */
declare function evaluate(view: TrajectoryView, assertion: Assertion): AssertionResult;
/**
 * Evaluate a list of assertions independently. Used at the test-case level
 * where each top-level assertion is reported separately (and thresholded
 * separately, in the runner layer).
 */
declare function evaluateAll(view: TrajectoryView, assertions: Assertion[]): AssertionResult[];
//#endregion
|
|
488
|
+
//#region src/adapters/registry.d.ts
/**
 * Register a harness adapter by id.
 *
 * Duplicate ids throw — registration is explicit so accidental overrides
 * surface immediately during startup or test setup.
 */
declare function registerAdapter(id: string, adapter: HarnessAdapter): void;
/** Return all registered adapter ids (built-in and runtime). */
declare function listAdapters(): string[];
/** Resolve an adapter by id. Throws if unknown. */
declare function getAdapter(id: string): HarnessAdapter;
/** Default adapter when YAML omits `adapter`. */
declare const DEFAULT_ADAPTER_ID = "claude-code";
/** Return the default adapter (presumably the one registered under {@link DEFAULT_ADAPTER_ID} — confirm in the implementation). */
declare function getDefaultAdapter(): HarnessAdapter;
//#endregion
|
|
504
|
+
//#region src/runner/case.d.ts
/** Default repetition count when `case.repetitions` is omitted. */
declare const DEFAULT_REPETITIONS = 5;
/** Default assertion pass-rate threshold when `threshold` is omitted. */
declare const DEFAULT_THRESHOLD = 1;
/** Function that runs an adapter with a merged config and resolves to its result. */
type AdapterRunFn = (config: BaseAdapterConfig & Record<string, unknown>) => Promise<AdapterResult>;
/**
 * Build the effective adapter config for one (suite, case, cell).
 *
 * Merge order (later wins): defaultConfig < case.config < cell.config.
 */
declare function mergeConfig(suite: TestSuite, testCase: TestCase, cell: MatrixCell): BaseAdapterConfig & Record<string, unknown>;
/** Repetition count for a test case (presumably falls back to {@link DEFAULT_REPETITIONS} — confirm in the implementation). */
declare function getRepetitions(testCase: TestCase): number;
/**
 * Run one repetition of a test case through `run` and resolve to its
 * {@link RepetitionResult}. An optional `AbortSignal` may be supplied.
 */
declare function runRepetition(testCase: TestCase, _cell: MatrixCell, config: BaseAdapterConfig & Record<string, unknown>, repetitionIndex: number, run: AdapterRunFn, signal?: AbortSignal): Promise<RepetitionResult>;
/** Combine the repetition results of one (test case, cell) pair into a {@link CellReport}. */
declare function aggregateCell(testCase: TestCase, cell: MatrixCell, repetitions: RepetitionResult[]): CellReport;
//#endregion
|
|
520
|
+
//#region src/runner/limit.d.ts
/**
 * Promise-based concurrency limiter.
 *
 * Functionally equivalent to the `p-limit` package, inlined to avoid an
 * external dependency for ~20 lines of code.
 *
 * Usage:
 *
 *   const limit = createLimit(4);
 *   const results = await Promise.all(tasks.map(t => limit(() => run(t))));
 *
 * The limiter is unbounded in queue depth — it doesn't push back on the
 * caller. If you need bounded enqueue, wrap it.
 */
/** A function that runs an async task under the concurrency limit. */
type LimitedRunner = <T>(fn: () => Promise<T>) => Promise<T>;
/** Create a {@link LimitedRunner} for the given `max` (the concurrency limit, per the usage note above). */
declare function createLimit(max: number): LimitedRunner;
//#endregion
|
|
539
|
+
//#region src/otel/types.d.ts
/**
 * Minimal OTLP JSON types for trace export.
 *
 * Shapes follow OTLP/HTTP JSON Protobuf encoding (lowerCamelCase field names).
 * @see https://opentelemetry.io/docs/specs/otlp/
 */
/** Top-level trace export payload. */
interface ExportTraceServiceRequest {
  resourceSpans: ResourceSpans[];
}
/** Spans grouped under one resource. */
interface ResourceSpans {
  resource: Resource;
  scopeSpans: ScopeSpans[];
}
/** Resource described by a list of attributes. */
interface Resource {
  attributes: KeyValue[];
}
/** Spans grouped under one instrumentation scope. */
interface ScopeSpans {
  scope: InstrumentationScope;
  spans: Span[];
}
/** Name and optional version of an instrumentation scope. */
interface InstrumentationScope {
  name: string;
  version?: string;
}
/** One span; the start/end timestamps are carried as strings. */
interface Span {
  traceId: string;
  spanId: string;
  parentSpanId?: string;
  name: string;
  kind: number;
  startTimeUnixNano: string;
  endTimeUnixNano: string;
  attributes: KeyValue[];
  status?: SpanStatus;
}
/** Span status code with an optional message. */
interface SpanStatus {
  code: number;
  message?: string;
}
/** Attribute key paired with an {@link AnyValue}. */
interface KeyValue {
  key: string;
  value: AnyValue;
}
/** Attribute value; every variant field is optional. */
interface AnyValue {
  stringValue?: string;
  boolValue?: boolean;
  /** Integer value carried as a string. */
  intValue?: string;
  doubleValue?: number;
  bytesValue?: string;
  arrayValue?: ArrayValue;
  kvlistValue?: KeyValueList;
}
/** List of {@link AnyValue} entries. */
interface ArrayValue {
  values: AnyValue[];
}
/** List of {@link KeyValue} entries. */
interface KeyValueList {
  values: KeyValue[];
}
/** Options for emitting OTLP trace JSON. */
interface EmitOtelOptions {
  /** User prompt for the first `gen_ai.input.messages` entry. */
  prompt?: string;
  /** `gen_ai.agent.name` on the root span. Default: `claude-code`. */
  agentName?: string;
  /** `gen_ai.provider.name`. Default: `anthropic`. */
  providerName?: string;
  /** Resource `service.name`. Default: `harness-eval`. */
  serviceName?: string;
  /** Instrumentation scope name. Default: `@alis-build/harness-eval`. */
  instrumentationScope?: string;
  /**
   * Wall-clock end time for the trace (ms). Defaults to `Date.now()`.
   * Start is derived from `view.usage.durationMs`.
   */
  endTimeMs?: number;
}
//#endregion
|
|
616
|
+
//#region src/otel/emitter.d.ts
/**
 * Map a {@link TrajectoryView} to OTLP trace JSON.
 *
 * Span tree (siblings under `invoke_agent`, not nested):
 * ```
 * invoke_agent
 * ├── chat {model}
 * ├── execute_tool {name}
 * ├── chat {model}
 * └── ...
 * ```
 */
declare function trajectoryToOtlp(view: TrajectoryView, options?: EmitOtelOptions): ExportTraceServiceRequest;
/** Alias matching the implementation plan naming. */
declare const emitOtel: typeof trajectoryToOtlp;
//#endregion
|
|
633
|
+
//#region src/grader/grade-report.d.ts
/**
 * Grade a {@link SuiteReport} and resolve to a {@link SuiteGradingReport}.
 *
 * @param report - Suite report to grade.
 * @param options - Optional grading options.
 */
declare function gradeReport(report: SuiteReport, options?: GradeReportOptions): Promise<SuiteGradingReport>;
//#endregion
|
|
636
|
+
//#region src/grader/resolve-grade-options.d.ts
/** Grading settings that can be supplied as CLI flags. */
interface GradeCliOverrides {
  model?: string;
  binary?: string;
  timeoutMs?: number;
  maxConcurrent?: number;
  expectationsPath?: string;
  sourceReport?: string;
}
/**
 * Merge standalone grading YAML with CLI flags (CLI wins).
 */
declare function resolveGradeOptions(fileConfig?: GradingConfig, cli?: GradeCliOverrides, configPath?: string): GradeReportOptions;
//#endregion
|
|
650
|
+
//#region src/grader/transcript.d.ts
/**
 * Render a {@link TrajectoryView} as a transcript string.
 *
 * @param view - Trajectory to render.
 * @param prompt - Optional prompt (presumably included in the transcript — confirm in the implementation).
 */
declare function trajectoryToTranscript(view: TrajectoryView, prompt?: string): string;
//#endregion
|
|
653
|
+
//#region src/grader/claude-grader.d.ts
/** Options for {@link createClaudeGrader}. */
interface ClaudeGraderOptions {
  binary?: string;
  model?: string;
  timeoutMs?: number;
  env?: Record<string, string>;
  cwd?: string;
  claudeCode?: ClaudeCodeOptions;
}
/** Create a {@link GraderFn} configured by the given options. */
declare function createClaudeGrader(options?: ClaudeGraderOptions): GraderFn;
//#endregion
|
|
664
|
+
//#region src/grader/format-console.d.ts
/**
 * Format a {@link SuiteGradingReport} as a string.
 *
 * @param report - Grading report to format.
 * @param color - Optional flag (presumably toggles colored output — confirm in the implementation).
 */
declare function formatGradingConsole(report: SuiteGradingReport, color?: boolean): string;
/** Return a boolean pass/fail verdict for a {@link SuiteGradingReport}. */
declare function gradingReportPassed(report: SuiteGradingReport): boolean;
//#endregion
|
|
668
|
+
//#region src/reporter/types.d.ts
/** Supported report output formats. */
type ReportFormat = "console" | "markdown" | "json";
/** Options for formatting a suite report. */
interface ReporterOptions {
  format: ReportFormat;
  /** Optional baseline report (presumably used for comparison — confirm in the reporter). */
  baseline?: SuiteReport;
  color?: boolean;
}
//#endregion
|
|
676
|
+
//#region src/reporter/index.d.ts
/**
 * Format a {@link SuiteReport} as a string according to `options`.
 *
 * @param report - Suite report to format.
 * @param options - Reporter options, including the required `format`.
 */
declare function formatReport(report: SuiteReport, options: ReporterOptions): string;
//#endregion
|
|
679
|
+
//#region src/eval-record/build.d.ts
/**
 * Convert a {@link SuiteReport} (and optional grading) into a versioned
 * {@link EvalRunEnvelope} for storage or API handoff.
 */
declare function buildEvalRunEnvelope(report: SuiteReport, options?: BuildEvalRunEnvelopeOptions): EvalRunEnvelope;
/** Build envelope from on-disk report + optional grading JSON paths. */
declare function buildEvalRunEnvelopeFromFiles(reportPath: string, options?: BuildEvalRunEnvelopeOptions & {
  gradingPath?: string;
  suitePath?: string;
}): Promise<EvalRunEnvelope>;
//#endregion
|
|
691
|
+
//#region src/metrics/tool-calls.d.ts
|
|
692
|
+
/**
 * Options accepted by `toolParameterKvMatch` and `computeToolCallMetrics`.
 *
 * The only field is the optional boolean `useStrictStringMatch`.
 * NOTE(review): presumably switches parameter-value comparison to exact
 * string equality; the default is not visible here — confirm.
 */
interface ToolCallMetricOptions {
|
|
693
|
+
useStrictStringMatch?: boolean;
|
|
694
|
+
}
|
|
695
|
+
/**
 * A single tool call as accepted by the tool-call metric functions: an
 * `InterchangeToolCall`, a `TabularToolCall`, or an inline object with a
 * `tool_name` string and a `tool_input` of unknown type.
 */
type ToolCallInput = InterchangeToolCall | TabularToolCall | {
|
|
696
|
+
tool_name: string;
|
|
697
|
+
tool_input: unknown;
|
|
698
|
+
};
|
|
699
|
+
/**
 * Scores a single tool call as a number.
 *
 * NOTE(review): presumably 1 for a well-formed call and 0 otherwise (the name
 * mirrors the Vertex AI `tool_call_valid` metric) — the range is not visible
 * in this declaration; confirm.
 *
 * @param toolCall - The tool call to check.
 * @returns A numeric score.
 */
declare function toolCallValid(toolCall: ToolCallInput): number;
|
|
700
|
+
/**
 * Scores a predicted tool call against a reference tool call as a number.
 *
 * NOTE(review): presumably compares tool names only, yielding 1 on a match and
 * 0 otherwise — confirm against src/metrics/tool-calls.
 *
 * @param predicted - The tool call produced by the run.
 * @param reference - The expected tool call.
 * @returns A numeric score.
 */
declare function toolNameMatch(predicted: ToolCallInput, reference: ToolCallInput): number;
|
|
701
|
+
/**
 * Scores a predicted tool call against a reference tool call as a number.
 *
 * NOTE(review): presumably compares the parameter key sets of the two calls —
 * scoring rule and range are not visible here; confirm.
 *
 * @param predicted - The tool call produced by the run.
 * @param reference - The expected tool call.
 * @returns A numeric score.
 */
declare function toolParameterKeyMatch(predicted: ToolCallInput, reference: ToolCallInput): number;
|
|
702
|
+
/**
 * Scores a predicted tool call against a reference tool call as a number,
 * with optional {@link ToolCallMetricOptions}.
 *
 * NOTE(review): presumably compares parameter keys and values, with
 * `options.useStrictStringMatch` tightening the value comparison — confirm.
 *
 * @param predicted - The tool call produced by the run.
 * @param reference - The expected tool call.
 * @param options - Optional metric options; may be omitted.
 * @returns A numeric score.
 */
declare function toolParameterKvMatch(predicted: ToolCallInput, reference: ToolCallInput, options?: ToolCallMetricOptions): number;
|
|
703
|
+
/**
 * Computes a `ToolCallMetrics` value from arrays of predicted and reference
 * tool calls.
 *
 * NOTE(review): how the two arrays are paired (by index or otherwise) and how
 * length mismatches are handled is not visible here — confirm.
 *
 * @param predicted - Tool calls produced by the run.
 * @param reference - Expected tool calls.
 * @param options - Optional {@link ToolCallMetricOptions}; may be omitted.
 * @returns The computed `ToolCallMetrics`.
 */
declare function computeToolCallMetrics(predicted: ToolCallInput[], reference: ToolCallInput[], options?: ToolCallMetricOptions): ToolCallMetrics;
|
|
704
|
+
//#endregion
|
|
705
|
+
//#region src/eval-interchange/projections.d.ts
|
|
706
|
+
/**
 * Projects an `EvalRunEnvelope` into an array of `EvalDatasetRow`.
 *
 * NOTE(review): row granularity (per cell, per repetition, ...) is not visible
 * in this declaration — confirm against src/eval-interchange/projections.
 *
 * @param envelope - The envelope to project.
 * @returns The dataset rows.
 */
declare function toTrajectory(envelope: EvalRunEnvelope): EvalDatasetRow[];
|
|
707
|
+
/**
 * Projects an `EvalRunEnvelope` into an array of `ProtoTrajectoryInstance`.
 *
 * @param envelope - The envelope to project.
 * @returns The proto trajectory instances.
 */
declare function toProtoInstances(envelope: EvalRunEnvelope): ProtoTrajectoryInstance[];
|
|
708
|
+
/**
 * Projects an `EvalRunEnvelope` into an array of `AgentTrace`.
 *
 * @param envelope - The envelope to project.
 * @returns The agent traces.
 */
declare function toAgentTrace(envelope: EvalRunEnvelope): AgentTrace[];
|
|
709
|
+
/**
 * Takes an `EvalRepetition` and an optional reference trajectory and returns
 * an `EvalRepetition`.
 *
 * NOTE(review): whether the input repetition is mutated or copied, and which
 * fields are added, is not visible in this declaration — confirm.
 *
 * @param repetition - The repetition to enrich.
 * @param referenceTrajectory - Optional array of `TabularToolCall`; may be omitted.
 * @returns An `EvalRepetition`.
 */
declare function enrichRepetitionWithInterchange(repetition: EvalRepetition, referenceTrajectory?: TabularToolCall[]): EvalRepetition;
|
|
710
|
+
//#endregion
|
|
711
|
+
//#region src/metrics/trajectory.d.ts
|
|
712
|
+
/**
 * A sequence of tool calls as accepted by the trajectory metric functions.
 *
 * As written, the union is over whole arrays: an `InterchangeToolCall[]`, a
 * `TabularToolCall[]`, or an array of inline `{ tool_name, tool_input }`
 * objects. Each alternative has a single element type.
 */
type TrajectoryInput = InterchangeToolCall[] | TabularToolCall[] | Array<{
|
|
713
|
+
tool_name: string;
|
|
714
|
+
tool_input: unknown;
|
|
715
|
+
}>;
|
|
716
|
+
/**
 * Scores a predicted trajectory against a reference trajectory as a number.
 *
 * NOTE(review): presumably 1 only when both trajectories contain the same
 * calls in the same order (the name mirrors Vertex AI's
 * `trajectory_exact_match`) — confirm against src/metrics/trajectory.
 *
 * @param predicted - Tool calls produced by the run.
 * @param reference - Expected tool calls.
 * @returns A numeric score.
 */
declare function trajectoryExactMatch(predicted: TrajectoryInput, reference: TrajectoryInput): number;
|
|
717
|
+
/**
 * Scores a predicted trajectory against a reference trajectory as a number.
 *
 * NOTE(review): presumably 1 when the reference calls appear in the predicted
 * trajectory in order, extra calls allowed — confirm.
 *
 * @param predicted - Tool calls produced by the run.
 * @param reference - Expected tool calls.
 * @returns A numeric score.
 */
declare function trajectoryInOrderMatch(predicted: TrajectoryInput, reference: TrajectoryInput): number;
|
|
718
|
+
/**
 * Scores a predicted trajectory against a reference trajectory as a number.
 *
 * NOTE(review): presumably 1 when every reference call appears in the
 * predicted trajectory regardless of order — confirm.
 *
 * @param predicted - Tool calls produced by the run.
 * @param reference - Expected tool calls.
 * @returns A numeric score.
 */
declare function trajectoryAnyOrderMatch(predicted: TrajectoryInput, reference: TrajectoryInput): number;
|
|
719
|
+
/**
 * Scores a predicted trajectory against a reference trajectory as a number.
 *
 * NOTE(review): presumably the fraction of predicted calls that also occur in
 * the reference; behaviour for an empty `predicted` is not visible — confirm.
 *
 * @param predicted - Tool calls produced by the run.
 * @param reference - Expected tool calls.
 * @returns A numeric score.
 */
declare function trajectoryPrecision(predicted: TrajectoryInput, reference: TrajectoryInput): number;
|
|
720
|
+
/**
 * Scores a predicted trajectory against a reference trajectory as a number.
 *
 * NOTE(review): presumably the fraction of reference calls that also occur in
 * the prediction; behaviour for an empty `reference` is not visible — confirm.
 *
 * @param predicted - Tool calls produced by the run.
 * @param reference - Expected tool calls.
 * @returns A numeric score.
 */
declare function trajectoryRecall(predicted: TrajectoryInput, reference: TrajectoryInput): number;
|
|
721
|
+
/**
 * Scores a predicted trajectory against a reference trajectory as a number.
 *
 * NOTE(review): the name mirrors Vertex AI's `trajectory_single_tool_use`,
 * which checks for one specific tool; here the second argument is a whole
 * reference trajectory, so how the target tool is chosen is unclear — confirm.
 *
 * @param predicted - Tool calls produced by the run.
 * @param reference - Expected tool calls.
 * @returns A numeric score.
 */
declare function trajectorySingleToolUse(predicted: TrajectoryInput, reference: TrajectoryInput): number;
|
|
722
|
+
/**
 * Computes a `TrajectoryMetrics` value from a predicted and a reference
 * trajectory.
 *
 * NOTE(review): presumably bundles the individual `trajectory*` scores
 * declared alongside it; the field set of `TrajectoryMetrics` is declared
 * elsewhere — confirm.
 *
 * @param predicted - Tool calls produced by the run.
 * @param reference - Expected tool calls.
 * @returns The computed `TrajectoryMetrics`.
 */
declare function computeTrajectoryMetrics(predicted: TrajectoryInput, reference: TrajectoryInput): TrajectoryMetrics;
|
|
723
|
+
//#endregion
|
|
724
|
+
export { type AdapterDiagnostics, AdapterError, type AdapterResult, type AdapterRunFn, AgentConfig, AgentEvent, AgentTrace, Assertion, AssertionResult, AssertionStat, AssistantMessage, AssistantMessageEvent, AssistantTurn, type BaseAdapterConfig, BuildEvalRunEnvelopeOptions, Cardinality, CellReport, CompoundPredicate, ConfigError, ContentBlock, ContentPart, ConversationTurn, DEFAULT_ADAPTER_ID, DEFAULT_REPETITIONS, DEFAULT_THRESHOLD, EVAL_RUN_SCHEMA_VERSION, type EmitOtelOptions, EvalArtifacts, EvalAssertionStat, EvalCellResult, EvalDatasetRow, EvalProvenance, EvalRepetition, EvalRunEnvelope, EvalRunSummary, type ExportTraceServiceRequest, ExternalScore, type GradeReportOptions, type HarnessAdapter, HarnessInfo, InterchangeToolCall, InterchangeTrajectory, JudgeInfo, LeafPredicate, type LimitedRunner, MatrixCell, McpServerStatus, ObjectPredicate, OutcomeGrades, type ParseErrorRecord, type ParseResult, Predicate, ProgressCallback, ProgressEvent, ProtoTrajectoryInstance, type RepGradingResult, RepetitionError, RepetitionResult, type ReporterOptions, ResultEvent, RetryRecord, RunSuiteOptions, SessionMeta, StopReason, StreamEvent, type SuiteConfig, type SuiteGradingReport, SuiteReference, SuiteReport, SystemCompactBoundaryEvent, SystemInitEvent, SystemPluginInstallEvent, SystemRetryEvent, SystemUnknownEvent, TRAJECTORY_SCHEMA_VERSION, TabularToolCall, TestCase, TestSuite, TextBlock, ThresholdedAssertion, ToolCall, type ToolCallMetricOptions, ToolCallMetrics, ToolPattern, ToolResultBlock, ToolUseBlock, TrajectoryBuilder, type TrajectoryInput, TrajectoryMetrics, TrajectoryView, Usage, UsageSummary, UserMessage, UserMessageEvent, aggregateCell, buildEvalRunEnvelope, buildEvalRunEnvelopeFromFiles, buildTrajectory, index_d_exports as claudeCode, computeToolCallMetrics, computeTrajectoryMetrics, createClaudeGrader, createLimit, emitOtel, enrichRepetitionWithInterchange, evaluate, evaluateAll, formatGradingConsole, formatReport, getAdapter, getDefaultAdapter, getRepetitions, 
gradeReport, gradingReportPassed, isAssistantMessage, isResult, isSystemInit, isSystemRetry, isTextBlock, isToolResultBlock, isToolUseBlock, isUserMessage, listAdapters, loadSuite, mergeConfig, namespaceOf, parseStreamJson, parseSuite, registerAdapter, resolveGradeOptions, runRepetition, runSuite, toAgentTrace, toProtoInstances, toTrajectory, toolCallValid, toolNameMatch, toolParameterKeyMatch, toolParameterKvMatch, trajectoryAnyOrderMatch, trajectoryExactMatch, trajectoryInOrderMatch, trajectoryPrecision, trajectoryRecall, trajectorySingleToolUse, trajectoryToOtlp, trajectoryToTranscript };
|
|
725
|
+
//# sourceMappingURL=index.d.ts.map
|