@plaited/agent-eval-harness 0.9.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -0
- package/package.json +1 -1
- package/src/commands/balance.ts +1 -11
- package/src/commands/calibrate.ts +2 -10
- package/src/commands/capture.ts +104 -114
- package/src/commands/execution.ts +245 -0
- package/src/commands/tests/capture-cli.spec.ts +84 -0
- package/src/commands/tests/trials-cli.spec.ts +68 -0
- package/src/commands/trials.ts +98 -115
- package/src/commands/validate-refs.ts +3 -19
- package/src/core/core.ts +27 -1
- package/src/core/loading.ts +53 -19
- package/src/core/streaming.ts +172 -0
- package/src/core/tests/streaming.spec.ts +399 -0
- package/src/core/tests/worker-pool.spec.ts +377 -0
- package/src/core/worker-pool.ts +220 -0
- package/src/core.ts +15 -0
- package/src/schemas/grader-loader.ts +23 -6
- package/src/schemas/schemas-cli.ts +1 -6
- package/src/schemas/schemas.ts +2 -0
- package/src/schemas.ts +1 -1
package/src/commands/trials.ts
CHANGED
|
@@ -12,13 +12,12 @@
|
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
14
|
import { parseArgs } from 'node:util'
|
|
15
|
-
import { extractOutput, extractTrajectory,
|
|
16
|
-
import { type HeadlessAdapterConfig, parseHeadlessConfig } from '../headless/headless.schemas.ts'
|
|
15
|
+
import { createWorkspaceDir, extractOutput, extractTrajectory, logProgress, readStdinPrompts } from '../core.ts'
|
|
17
16
|
import type { ParsedUpdate } from '../headless/headless-output-parser.ts'
|
|
18
|
-
import {
|
|
19
|
-
import {
|
|
20
|
-
import {
|
|
21
|
-
import type
|
|
17
|
+
import { DEFAULT_TRIAL_COUNT } from '../schemas/constants.ts'
|
|
18
|
+
import { loadGraderOrExit } from '../schemas/grader-loader.ts'
|
|
19
|
+
import type { PromptCase, TrialEntry, TrialResult } from '../schemas.ts'
|
|
20
|
+
import { type BaseExecutionConfig, executePrompts, parseConcurrency, prepareExecution } from './execution.ts'
|
|
22
21
|
|
|
23
22
|
// ============================================================================
|
|
24
23
|
// Pass@k/Pass^k Calculation
|
|
@@ -74,27 +73,9 @@ export const calculatePassExpK = (passes: number, k: number): number => {
|
|
|
74
73
|
// ============================================================================
|
|
75
74
|
|
|
76
75
|
/** Configuration for trials command */
|
|
77
|
-
export type TrialsConfig = {
|
|
78
|
-
/** Path to prompts.jsonl file */
|
|
79
|
-
promptsPath: string
|
|
80
|
-
/** Path to agent schema JSON file */
|
|
81
|
-
schemaPath: string
|
|
76
|
+
export type TrialsConfig = BaseExecutionConfig & {
|
|
82
77
|
/** Number of trials per prompt */
|
|
83
78
|
k: number
|
|
84
|
-
/** Output file path */
|
|
85
|
-
outputPath?: string
|
|
86
|
-
/** Working directory for agent */
|
|
87
|
-
cwd?: string
|
|
88
|
-
/** Timeout per prompt in milliseconds (overrides schema default) */
|
|
89
|
-
timeout?: number
|
|
90
|
-
/** Show progress to stderr */
|
|
91
|
-
progress?: boolean
|
|
92
|
-
/** Append to output file */
|
|
93
|
-
append?: boolean
|
|
94
|
-
/** Optional grader function */
|
|
95
|
-
grader?: Grader
|
|
96
|
-
/** Enable debug mode */
|
|
97
|
-
debug?: boolean
|
|
98
79
|
}
|
|
99
80
|
|
|
100
81
|
// ============================================================================
|
|
@@ -108,79 +89,38 @@ export type TrialsConfig = {
|
|
|
108
89
|
* @returns Array of trial results
|
|
109
90
|
*/
|
|
110
91
|
export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> => {
|
|
111
|
-
const {
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
k,
|
|
115
|
-
outputPath,
|
|
116
|
-
cwd,
|
|
117
|
-
timeout,
|
|
118
|
-
progress = false,
|
|
119
|
-
append = false,
|
|
120
|
-
grader,
|
|
121
|
-
debug = false,
|
|
122
|
-
} = config
|
|
92
|
+
const { k } = config
|
|
93
|
+
const ctx = await prepareExecution(config)
|
|
94
|
+
const { schema, prompts, sessions, resolvedWorkspaceDir, defaultWorkingDir, progress, grader } = ctx
|
|
123
95
|
|
|
124
|
-
//
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
96
|
+
// Log progress info
|
|
97
|
+
logProgress(`Loaded ${prompts.length} prompts from ${config.promptsPath ?? 'stdin'}`, progress)
|
|
98
|
+
logProgress(`Running ${k} trials per prompt (${prompts.length * k} total executions)`, progress)
|
|
99
|
+
logProgress(`Schema: ${schema.name} (${config.schemaPath})`, progress)
|
|
100
|
+
logProgress(`Timeout: ${ctx.effectiveTimeout}ms`, progress)
|
|
101
|
+
if (ctx.concurrency > 1) {
|
|
102
|
+
logProgress(`Concurrency: ${ctx.concurrency} workers`, progress)
|
|
128
103
|
}
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
try {
|
|
132
|
-
const rawSchema = await schemaFile.json()
|
|
133
|
-
schema = parseHeadlessConfig(rawSchema)
|
|
134
|
-
} catch (error) {
|
|
135
|
-
throw new Error(`Invalid schema: ${error instanceof Error ? error.message : String(error)}`)
|
|
104
|
+
if (resolvedWorkspaceDir) {
|
|
105
|
+
logProgress(`Workspace: ${resolvedWorkspaceDir}`, progress)
|
|
136
106
|
}
|
|
137
|
-
|
|
138
|
-
// Load prompts
|
|
139
|
-
const prompts = await loadPrompts(promptsPath)
|
|
140
|
-
|
|
141
|
-
// Resolve output path
|
|
142
|
-
const resolvedOutputPath = outputPath ? resolvePath(outputPath) : undefined
|
|
143
|
-
|
|
144
|
-
// Determine effective timeout (CLI flag > schema default > harness default)
|
|
145
|
-
const schemaTimeout = 'timeout' in schema ? schema.timeout : undefined
|
|
146
|
-
const effectiveTimeout = timeout ?? schemaTimeout ?? DEFAULT_HARNESS_TIMEOUT
|
|
147
|
-
|
|
148
|
-
// Log progress info
|
|
149
|
-
logProgress(`Loaded ${prompts.length} prompts from ${promptsPath}`, progress)
|
|
150
|
-
logProgress(`Running ${k} trials per prompt`, progress)
|
|
151
|
-
logProgress(`Schema: ${schema.name} (${schemaPath})`, progress)
|
|
152
|
-
logProgress(`Timeout: ${effectiveTimeout}ms`, progress)
|
|
153
107
|
if (grader) {
|
|
154
108
|
logProgress('Grader: enabled (will compute pass@k metrics)', progress)
|
|
155
109
|
}
|
|
156
110
|
|
|
157
|
-
//
|
|
158
|
-
const
|
|
159
|
-
|
|
160
|
-
timeout: effectiveTimeout,
|
|
161
|
-
verbose: progress,
|
|
162
|
-
debug,
|
|
163
|
-
})
|
|
164
|
-
|
|
165
|
-
// Clear output file if not appending
|
|
166
|
-
if (resolvedOutputPath && !append) {
|
|
167
|
-
await Bun.write(resolvedOutputPath, '')
|
|
168
|
-
}
|
|
169
|
-
|
|
170
|
-
const workingDir = cwd ?? process.cwd()
|
|
171
|
-
const results: TrialResult[] = []
|
|
172
|
-
let isFirstOutput = true
|
|
173
|
-
|
|
174
|
-
// Run evaluations
|
|
175
|
-
for (let i = 0; i < prompts.length; i++) {
|
|
176
|
-
const promptCase = prompts[i]
|
|
177
|
-
if (!promptCase) continue
|
|
178
|
-
|
|
179
|
-
logProgress(`[${i + 1}/${prompts.length}] ${promptCase.id}: Running ${k} trials...`, progress)
|
|
111
|
+
// Process all trials for a single prompt
|
|
112
|
+
const processPromptTrials = async (promptCase: (typeof prompts)[number], index: number): Promise<TrialResult> => {
|
|
113
|
+
logProgress(`[${index + 1}/${prompts.length}] ${promptCase.id}: Running ${k} trials...`, progress)
|
|
180
114
|
|
|
181
115
|
const trialEntries: TrialEntry[] = []
|
|
182
116
|
|
|
183
117
|
for (let trialNum = 1; trialNum <= k; trialNum++) {
|
|
118
|
+
// Determine working directory (per-prompt workspace or default)
|
|
119
|
+
// For trials, include trial number in workspace path for isolation
|
|
120
|
+
const workingDir = resolvedWorkspaceDir
|
|
121
|
+
? await createWorkspaceDir(resolvedWorkspaceDir, `${promptCase.id}-trial-${trialNum}`)
|
|
122
|
+
: defaultWorkingDir
|
|
123
|
+
|
|
184
124
|
// Create fresh session for each trial
|
|
185
125
|
const session = await sessions.create(workingDir)
|
|
186
126
|
const startTime = Date.now()
|
|
@@ -190,8 +130,6 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
|
|
|
190
130
|
const inputs = Array.isArray(promptCase.input) ? promptCase.input : [promptCase.input]
|
|
191
131
|
const allUpdates: ParsedUpdate[] = []
|
|
192
132
|
|
|
193
|
-
// TODO: Per-prompt timeout from promptCase.timeout is documented but not yet implemented
|
|
194
|
-
|
|
195
133
|
// Execute each turn sequentially
|
|
196
134
|
for (const turnInput of inputs) {
|
|
197
135
|
const turnResult = await sessions.prompt(session.id, turnInput)
|
|
@@ -223,7 +161,6 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
|
|
|
223
161
|
entry.score = graderResult.score
|
|
224
162
|
entry.reasoning = graderResult.reasoning
|
|
225
163
|
|
|
226
|
-
// Merge outcome from grader if present
|
|
227
164
|
if (graderResult.outcome) {
|
|
228
165
|
entry.outcome = graderResult.outcome
|
|
229
166
|
}
|
|
@@ -234,9 +171,6 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
|
|
|
234
171
|
` Trial ${trialNum}/${k}: ${entry.pass !== undefined ? (entry.pass ? '✓' : '✗') : '?'}`,
|
|
235
172
|
progress,
|
|
236
173
|
)
|
|
237
|
-
|
|
238
|
-
// Clean up session
|
|
239
|
-
sessions.destroy(session.id)
|
|
240
174
|
} catch (error) {
|
|
241
175
|
const endTime = Date.now()
|
|
242
176
|
const message = error instanceof Error ? error.message : String(error)
|
|
@@ -250,6 +184,9 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
|
|
|
250
184
|
reasoning: `Error: ${message}`,
|
|
251
185
|
})
|
|
252
186
|
logProgress(` Trial ${trialNum}/${k}: ! (error)`, progress)
|
|
187
|
+
} finally {
|
|
188
|
+
// Always clean up session
|
|
189
|
+
sessions.destroy(session.id)
|
|
253
190
|
}
|
|
254
191
|
}
|
|
255
192
|
|
|
@@ -260,6 +197,11 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
|
|
|
260
197
|
...(promptCase.hint && { hint: promptCase.hint }),
|
|
261
198
|
k,
|
|
262
199
|
trials: trialEntries,
|
|
200
|
+
metadata: {
|
|
201
|
+
...promptCase.metadata,
|
|
202
|
+
agent: schema.name,
|
|
203
|
+
...(resolvedWorkspaceDir && { workspaceDir: resolvedWorkspaceDir }),
|
|
204
|
+
},
|
|
263
205
|
}
|
|
264
206
|
|
|
265
207
|
// Calculate metrics if grader was used
|
|
@@ -270,23 +212,21 @@ export const runTrials = async (config: TrialsConfig): Promise<TrialResult[]> =>
|
|
|
270
212
|
result.passExpK = calculatePassExpK(passes, k)
|
|
271
213
|
}
|
|
272
214
|
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
// Write result immediately
|
|
276
|
-
const formatted = JSON.stringify(result)
|
|
277
|
-
await writeOutput(formatted, resolvedOutputPath, !isFirstOutput)
|
|
278
|
-
isFirstOutput = false
|
|
215
|
+
// Write result immediately (coordinated via mutex for concurrent writes)
|
|
216
|
+
await ctx.writeResult(result)
|
|
279
217
|
|
|
280
218
|
if (grader) {
|
|
281
219
|
logProgress(
|
|
282
|
-
` → passRate=${(result.passRate ?? 0).toFixed(2)}, pass@${k}=${(result.passAtK ?? 0).toFixed(2)}`,
|
|
220
|
+
` → ${promptCase.id}: passRate=${(result.passRate ?? 0).toFixed(2)}, pass@${k}=${(result.passAtK ?? 0).toFixed(2)}`,
|
|
283
221
|
progress,
|
|
284
222
|
)
|
|
285
223
|
}
|
|
224
|
+
|
|
225
|
+
return result
|
|
286
226
|
}
|
|
287
227
|
|
|
288
|
-
|
|
289
|
-
return
|
|
228
|
+
// Run with worker pool (parallelizes across prompts, trials for each prompt run sequentially)
|
|
229
|
+
return executePrompts(ctx, processPromptTrials)
|
|
290
230
|
}
|
|
291
231
|
|
|
292
232
|
// ============================================================================
|
|
@@ -311,6 +251,9 @@ export const trials = async (args: string[]): Promise<void> => {
|
|
|
311
251
|
append: { type: 'boolean', default: false },
|
|
312
252
|
grader: { type: 'string', short: 'g' },
|
|
313
253
|
debug: { type: 'boolean', default: false },
|
|
254
|
+
stdin: { type: 'boolean', default: false },
|
|
255
|
+
concurrency: { type: 'string', short: 'j' },
|
|
256
|
+
'workspace-dir': { type: 'string' },
|
|
314
257
|
help: { type: 'boolean', short: 'h' },
|
|
315
258
|
},
|
|
316
259
|
allowPositionals: true,
|
|
@@ -319,6 +262,7 @@ export const trials = async (args: string[]): Promise<void> => {
|
|
|
319
262
|
if (values.help) {
|
|
320
263
|
console.log(`
|
|
321
264
|
Usage: agent-eval-harness trials <prompts.jsonl> --schema <schema.json> [options]
|
|
265
|
+
cat prompts.jsonl | agent-eval-harness trials --stdin --schema <schema.json> [options]
|
|
322
266
|
|
|
323
267
|
Arguments:
|
|
324
268
|
prompts.jsonl Input file with evaluation prompts
|
|
@@ -329,6 +273,9 @@ Options:
|
|
|
329
273
|
-k Number of trials per prompt (default: ${DEFAULT_TRIAL_COUNT})
|
|
330
274
|
-c, --cwd Working directory for agent
|
|
331
275
|
-t, --timeout Request timeout in ms (overrides schema default)
|
|
276
|
+
-j, --concurrency Number of concurrent workers (default: 1)
|
|
277
|
+
--stdin Read prompts from stdin (mutually exclusive with file arg)
|
|
278
|
+
--workspace-dir Base directory for per-trial workspace isolation
|
|
332
279
|
--progress Show progress to stderr
|
|
333
280
|
--append Append to output file
|
|
334
281
|
-g, --grader Path to grader (.ts/.js module or executable script)
|
|
@@ -343,22 +290,52 @@ Graders:
|
|
|
343
290
|
TS/JS modules must export a 'grade' function.
|
|
344
291
|
Executable scripts (Python, etc.) use stdin/stdout JSON protocol.
|
|
345
292
|
|
|
293
|
+
Parallelization:
|
|
294
|
+
Use -j/--concurrency to run multiple prompts' trials in parallel.
|
|
295
|
+
Each prompt's k trials still run sequentially (required for aggregation).
|
|
296
|
+
With 151 prompts and -j 4, you get 4 prompts running trials concurrently.
|
|
297
|
+
|
|
298
|
+
Memory: Stream-mode agents (e.g. Claude Code) spawn real subprocesses
|
|
299
|
+
at ~400-500MB RSS each. With -j 8 that is 3-4GB of resident memory.
|
|
300
|
+
In memory-constrained environments (Docker, CI) this can cause OOM kills.
|
|
301
|
+
Use --stdin to pipe prompts for container-level orchestration.
|
|
302
|
+
|
|
303
|
+
Workspace Isolation:
|
|
304
|
+
Use --workspace-dir to create per-trial directories.
|
|
305
|
+
Each trial runs in {workspace-dir}/prompt-{id}-trial-{n}/.
|
|
306
|
+
Useful for code generation tasks requiring filesystem isolation.
|
|
307
|
+
|
|
346
308
|
Examples:
|
|
347
|
-
#
|
|
309
|
+
# Basic trials
|
|
348
310
|
agent-eval-harness trials prompts.jsonl -s claude.json -k 5 -o trials.jsonl
|
|
349
311
|
|
|
312
|
+
# Run 4 prompts' trials in parallel (4x faster for 151 prompts)
|
|
313
|
+
agent-eval-harness trials prompts.jsonl -s claude.json -k 5 -j 4 -o trials.jsonl
|
|
314
|
+
|
|
315
|
+
# With workspace isolation for code generation
|
|
316
|
+
agent-eval-harness trials prompts.jsonl -s claude.json -k 5 -j 4 \\
|
|
317
|
+
--workspace-dir ./workspaces -o trials.jsonl
|
|
318
|
+
|
|
350
319
|
# With TypeScript grader
|
|
351
320
|
agent-eval-harness trials prompts.jsonl -s claude.json -k 5 --grader ./grader.ts -o trials.jsonl
|
|
352
321
|
|
|
353
|
-
#
|
|
354
|
-
agent-eval-harness trials
|
|
322
|
+
# Read prompts from stdin (container orchestration)
|
|
323
|
+
cat prompts.jsonl | agent-eval-harness trials --stdin -s claude.json -k 5 -o trials.jsonl
|
|
355
324
|
`)
|
|
356
325
|
return
|
|
357
326
|
}
|
|
358
327
|
|
|
359
328
|
const promptsPath = positionals[0]
|
|
360
|
-
|
|
361
|
-
|
|
329
|
+
const useStdin = values.stdin ?? false
|
|
330
|
+
|
|
331
|
+
// Mutual exclusivity: --stdin and positional file
|
|
332
|
+
if (useStdin && promptsPath) {
|
|
333
|
+
console.error('Error: --stdin and prompts file argument are mutually exclusive')
|
|
334
|
+
process.exit(1)
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
if (!useStdin && !promptsPath) {
|
|
338
|
+
console.error('Error: prompts.jsonl path is required (or use --stdin)')
|
|
362
339
|
process.exit(1)
|
|
363
340
|
}
|
|
364
341
|
|
|
@@ -368,19 +345,23 @@ Examples:
|
|
|
368
345
|
process.exit(1)
|
|
369
346
|
}
|
|
370
347
|
|
|
371
|
-
//
|
|
372
|
-
let
|
|
373
|
-
if (
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
console.error(`Error: ${error instanceof Error ? error.message : error}`)
|
|
348
|
+
// Read prompts from stdin if requested
|
|
349
|
+
let prompts: PromptCase[] | undefined
|
|
350
|
+
if (useStdin) {
|
|
351
|
+
const stdinPrompts = await readStdinPrompts()
|
|
352
|
+
if (!stdinPrompts || stdinPrompts.length === 0) {
|
|
353
|
+
console.error('Error: no prompts received on stdin')
|
|
378
354
|
process.exit(1)
|
|
379
355
|
}
|
|
356
|
+
prompts = stdinPrompts
|
|
380
357
|
}
|
|
381
358
|
|
|
359
|
+
// Load grader if specified
|
|
360
|
+
const grader = values.grader ? await loadGraderOrExit(values.grader) : undefined
|
|
361
|
+
|
|
382
362
|
await runTrials({
|
|
383
|
-
promptsPath,
|
|
363
|
+
promptsPath: promptsPath ?? undefined,
|
|
364
|
+
prompts,
|
|
384
365
|
schemaPath: values.schema,
|
|
385
366
|
k: Number.parseInt(values.k ?? String(DEFAULT_TRIAL_COUNT), 10),
|
|
386
367
|
outputPath: values.output,
|
|
@@ -390,5 +371,7 @@ Examples:
|
|
|
390
371
|
append: values.append ?? false,
|
|
391
372
|
grader,
|
|
392
373
|
debug: values.debug ?? false,
|
|
374
|
+
concurrency: parseConcurrency(values.concurrency),
|
|
375
|
+
workspaceDir: values['workspace-dir'],
|
|
393
376
|
})
|
|
394
377
|
}
|
package/src/commands/validate-refs.ts
CHANGED
|
@@ -9,9 +9,9 @@
|
|
|
9
9
|
*/
|
|
10
10
|
|
|
11
11
|
import { parseArgs } from 'node:util'
|
|
12
|
-
import {
|
|
12
|
+
import { loadPrompts, resolvePath } from '../core.ts'
|
|
13
|
+
import { loadGraderOrExit } from '../schemas/grader-loader.ts'
|
|
13
14
|
import type { Grader, ValidationResult } from '../schemas.ts'
|
|
14
|
-
import { loadPrompts } from './capture.ts'
|
|
15
15
|
|
|
16
16
|
// ============================================================================
|
|
17
17
|
// Types
|
|
@@ -27,16 +27,6 @@ export type ValidateRefsConfig = {
|
|
|
27
27
|
grader: Grader
|
|
28
28
|
}
|
|
29
29
|
|
|
30
|
-
// ============================================================================
|
|
31
|
-
// Helpers
|
|
32
|
-
// ============================================================================
|
|
33
|
-
|
|
34
|
-
/** Resolve path relative to process.cwd() */
|
|
35
|
-
const resolvePath = (path: string): string => {
|
|
36
|
-
if (path.startsWith('/')) return path
|
|
37
|
-
return `${process.cwd()}/${path}`
|
|
38
|
-
}
|
|
39
|
-
|
|
40
30
|
// ============================================================================
|
|
41
31
|
// Validate-Refs Implementation
|
|
42
32
|
// ============================================================================
|
|
@@ -171,13 +161,7 @@ Examples:
|
|
|
171
161
|
}
|
|
172
162
|
|
|
173
163
|
// Load grader
|
|
174
|
-
|
|
175
|
-
try {
|
|
176
|
-
grader = await loadGrader(values.grader)
|
|
177
|
-
} catch (error) {
|
|
178
|
-
console.error(`Error: ${error instanceof Error ? error.message : error}`)
|
|
179
|
-
process.exit(1)
|
|
180
|
-
}
|
|
164
|
+
const grader = await loadGraderOrExit(values.grader)
|
|
181
165
|
|
|
182
166
|
await runValidateRefs({
|
|
183
167
|
promptsPath,
|
package/src/core/core.ts
CHANGED
|
@@ -11,9 +11,25 @@
|
|
|
11
11
|
*/
|
|
12
12
|
|
|
13
13
|
// Loading utilities
|
|
14
|
-
export {
|
|
14
|
+
export {
|
|
15
|
+
buildResultsIndex,
|
|
16
|
+
countLines,
|
|
17
|
+
loadJsonl,
|
|
18
|
+
loadPrompts,
|
|
19
|
+
loadResults,
|
|
20
|
+
readStdinPrompts,
|
|
21
|
+
streamResults,
|
|
22
|
+
} from './loading.ts'
|
|
15
23
|
// Output utilities
|
|
16
24
|
export { getInputPreview, headTailPreview, logProgress, resolvePath, writeOutput } from './output.ts'
|
|
25
|
+
// Native streaming utilities
|
|
26
|
+
export {
|
|
27
|
+
countLinesStreaming,
|
|
28
|
+
streamJsonl,
|
|
29
|
+
streamPrompts,
|
|
30
|
+
streamResultsNative,
|
|
31
|
+
streamTrialResults,
|
|
32
|
+
} from './streaming.ts'
|
|
17
33
|
// Trajectory utilities
|
|
18
34
|
export {
|
|
19
35
|
detectTrajectoryRichness,
|
|
@@ -23,3 +39,13 @@ export {
|
|
|
23
39
|
extractTrajectory,
|
|
24
40
|
hasToolErrors,
|
|
25
41
|
} from './trajectory.ts'
|
|
42
|
+
// Worker pool utilities
|
|
43
|
+
export {
|
|
44
|
+
createWorkspaceDir,
|
|
45
|
+
createWriteMutex,
|
|
46
|
+
type ProgressCallback,
|
|
47
|
+
runWorkerPool,
|
|
48
|
+
type WorkerPoolOptions,
|
|
49
|
+
type WorkerPoolResult,
|
|
50
|
+
type WriteMutex,
|
|
51
|
+
} from './worker-pool.ts'
|
package/src/core/loading.ts
CHANGED
|
@@ -39,6 +39,44 @@ export const loadPrompts = async (path: string): Promise<PromptCase[]> => {
|
|
|
39
39
|
})
|
|
40
40
|
}
|
|
41
41
|
|
|
42
|
+
/**
|
|
43
|
+
* Read prompts from stdin as JSONL.
|
|
44
|
+
*
|
|
45
|
+
* @remarks
|
|
46
|
+
* Reads all data from stdin, parses each line as JSON, and validates against
|
|
47
|
+
* PromptCaseSchema. Returns null when stdin is a TTY (no piped input).
|
|
48
|
+
* Uses chunked Buffer reads matching the pattern in pipeline/run.ts.
|
|
49
|
+
*
|
|
50
|
+
* @returns Parsed and validated prompt cases, or null if stdin is a TTY
|
|
51
|
+
* @throws Error if any line is invalid JSON or fails schema validation
|
|
52
|
+
*
|
|
53
|
+
* @public
|
|
54
|
+
*/
|
|
55
|
+
export const readStdinPrompts = async (): Promise<PromptCase[] | null> => {
|
|
56
|
+
if (process.stdin.isTTY) {
|
|
57
|
+
return null
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
const chunks: Buffer[] = []
|
|
61
|
+
for await (const chunk of process.stdin) {
|
|
62
|
+
chunks.push(chunk)
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
const content = Buffer.concat(chunks).toString('utf-8').trim()
|
|
66
|
+
if (!content) return null
|
|
67
|
+
|
|
68
|
+
return content
|
|
69
|
+
.split('\n')
|
|
70
|
+
.filter(Boolean)
|
|
71
|
+
.map((line, index) => {
|
|
72
|
+
try {
|
|
73
|
+
return PromptCaseSchema.parse(JSON.parse(line))
|
|
74
|
+
} catch (error) {
|
|
75
|
+
throw new Error(`Invalid stdin prompt at line ${index + 1}: ${error instanceof Error ? error.message : error}`)
|
|
76
|
+
}
|
|
77
|
+
})
|
|
78
|
+
}
|
|
79
|
+
|
|
42
80
|
/**
|
|
43
81
|
* Load capture results from a JSONL file.
|
|
44
82
|
*
|
|
@@ -99,12 +137,21 @@ export const loadJsonl = async <T = unknown>(path: string): Promise<T[]> => {
|
|
|
99
137
|
// Streaming Loading
|
|
100
138
|
// ============================================================================
|
|
101
139
|
|
|
140
|
+
// Re-export native streaming functions for backward compatibility
|
|
141
|
+
export {
|
|
142
|
+
countLinesStreaming,
|
|
143
|
+
streamJsonl,
|
|
144
|
+
streamPrompts,
|
|
145
|
+
streamResultsNative,
|
|
146
|
+
streamTrialResults,
|
|
147
|
+
} from './streaming.ts'
|
|
148
|
+
|
|
102
149
|
/**
|
|
103
150
|
* Stream capture results from a JSONL file.
|
|
104
151
|
*
|
|
105
152
|
* @remarks
|
|
106
153
|
* Memory-efficient alternative to loadResults for large files.
|
|
107
|
-
*
|
|
154
|
+
* Uses native streaming via Bun.file().stream() for O(1) memory usage.
|
|
108
155
|
*
|
|
109
156
|
* @param path - Path to the results.jsonl file
|
|
110
157
|
* @yields Parsed and validated capture results
|
|
@@ -113,20 +160,8 @@ export const loadJsonl = async <T = unknown>(path: string): Promise<T[]> => {
|
|
|
113
160
|
* @public
|
|
114
161
|
*/
|
|
115
162
|
export async function* streamResults(path: string): AsyncGenerator<CaptureResult, void, unknown> {
|
|
116
|
-
const
|
|
117
|
-
|
|
118
|
-
const lines = text.split('\n')
|
|
119
|
-
|
|
120
|
-
for (let i = 0; i < lines.length; i++) {
|
|
121
|
-
const line = lines[i]?.trim()
|
|
122
|
-
if (!line) continue
|
|
123
|
-
|
|
124
|
-
try {
|
|
125
|
-
yield CaptureResultSchema.parse(JSON.parse(line))
|
|
126
|
-
} catch (error) {
|
|
127
|
-
throw new Error(`Invalid result at line ${i + 1}: ${error instanceof Error ? error.message : error}`)
|
|
128
|
-
}
|
|
129
|
-
}
|
|
163
|
+
const { streamResultsNative } = await import('./streaming.ts')
|
|
164
|
+
yield* streamResultsNative(path)
|
|
130
165
|
}
|
|
131
166
|
|
|
132
167
|
/**
|
|
@@ -159,7 +194,7 @@ export const buildResultsIndex = async (path: string): Promise<Map<string, Captu
|
|
|
159
194
|
*
|
|
160
195
|
* @remarks
|
|
161
196
|
* Useful for detecting large files that should use streaming mode.
|
|
162
|
-
* Uses
|
|
197
|
+
* Uses native streaming for O(1) memory usage.
|
|
163
198
|
*
|
|
164
199
|
* @param path - Path to the JSONL file
|
|
165
200
|
* @returns Number of non-empty lines
|
|
@@ -167,7 +202,6 @@ export const buildResultsIndex = async (path: string): Promise<Map<string, Captu
|
|
|
167
202
|
* @public
|
|
168
203
|
*/
|
|
169
204
|
export const countLines = async (path: string): Promise<number> => {
|
|
170
|
-
const
|
|
171
|
-
|
|
172
|
-
return text.split('\n').filter((line) => line.trim()).length
|
|
205
|
+
const { countLinesStreaming } = await import('./streaming.ts')
|
|
206
|
+
return countLinesStreaming(path)
|
|
173
207
|
}
|