@tangle-network/agent-eval 0.20.7 → 0.20.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,126 +0,0 @@
1
- /**
2
- * GSM8K wrapper — exact-match grading on the final numeric answer.
3
- *
4
- * The dataset itself is NOT bundled. `loadDataset` will:
5
- * 1. read from `process.env.AGENT_EVAL_GSM8K_PATH` if set (a JSONL
6
- * file with `{ id, question, answer }` records — the standard
7
- * HF mirror layout converted to JSONL);
8
- * 2. otherwise throw a clearly-marked error pointing to the loader.
9
- *
10
- * `evaluate` parses the final number out of the response (last
11
- * occurrence of a signed-decimal-or-integer literal, optionally after
12
- * `####`, the GSM8K answer convention) and compares to the ground-
13
- * truth integer. Floating-point comparisons use a 1e-6 tolerance.
14
- */
15
-
16
- import { existsSync, readFileSync } from 'node:fs'
17
-
18
- import type {
19
- BenchmarkAdapter,
20
- BenchmarkDatasetItem,
21
- BenchmarkEvaluation,
22
- } from '../../../src/benchmarks/types'
23
- import { deterministicSplit } from '../../../src/benchmarks/types'
24
- import type { RunSplitTag } from '../../../src/run-record'
25
-
26
/**
 * Payload attached to one GSM8K dataset item.
 */
export interface Gsm8kPayload {
  /** Problem text, taken from the `question` field of the JSONL record. */
  question: string
  /**
   * Reference answer, always stored as text (for example "72" or "1.5").
   * A `#### N` form is also accepted by `parseGsm8kAnswer`.
   */
  answer: string
}
32
-
33
/** A benchmark dataset item whose payload is a GSM8K question/answer pair. */
export type Gsm8kItem = BenchmarkDatasetItem<Gsm8kPayload>
34
-
35
/**
 * GSM8K benchmark adapter: loads items from a user-supplied JSONL file and
 * grades a response by comparing its parsed number with the reference answer.
 */
class Gsm8kAdapter implements BenchmarkAdapter<Gsm8kItem, Gsm8kPayload> {
  /**
   * Read the JSONL file named by `AGENT_EVAL_GSM8K_PATH` and keep only the
   * items that `assignSplitImpl` places in `split`.
   *
   * @throws Error when the environment variable is unset or the file is missing.
   */
  async loadDataset(split: RunSplitTag): Promise<Gsm8kItem[]> {
    const datasetPath = process.env.AGENT_EVAL_GSM8K_PATH
    if (!datasetPath) {
      throw new Error(
        'GSM8K dataset not provided. Set AGENT_EVAL_GSM8K_PATH to a JSONL file ' +
          'with {id, question, answer} records (the HF GSM8K mirror converted to JSONL).',
      )
    }
    if (!existsSync(datasetPath)) {
      throw new Error(`AGENT_EVAL_GSM8K_PATH=${datasetPath} does not exist`)
    }

    const selected: Gsm8kItem[] = []
    for (const candidate of parseJsonl(datasetPath)) {
      if (assignSplitImpl(candidate.id) === split) {
        selected.push(candidate)
      }
    }
    return selected
  }

  /**
   * Score `response` against the item's reference answer.
   *
   * Returns score 1 when both values parse and differ by less than 1e-6,
   * otherwise 0. `raw.reason` is set when either side has no parsable number.
   */
  async evaluate(item: Gsm8kItem, response: string): Promise<BenchmarkEvaluation> {
    const reference = item.payload.answer
    const expected = parseGsm8kAnswer(reference)
    const observed = parseGsm8kAnswer(response)

    // Defensive: a dataset row whose reference is not numeric cannot be graded.
    if (expected === null) {
      return { score: 0, raw: { reason: 'reference_not_numeric', expected: reference } }
    }
    if (observed === null) {
      return { score: 0, raw: { reason: 'no_numeric_in_response', expected, observed: null } }
    }

    const exactMatch = Math.abs(expected - observed) < 1e-6
    const score = exactMatch ? 1 : 0
    return { score, raw: { expected, observed, exactMatch } }
  }

  /** Report which split `itemId` belongs to. */
  assignSplit(itemId: string): RunSplitTag {
    return assignSplitImpl(itemId)
  }
}
69
-
70
/**
 * Resolve the split for an item by passing a `gsm8k::`-prefixed key to
 * `deterministicSplit`.
 */
function assignSplitImpl(itemId: string): RunSplitTag {
  const namespacedKey = `gsm8k::${itemId}`
  return deterministicSplit(namespacedKey)
}
73
-
74
/**
 * Read a GSM8K JSONL file and turn every non-blank line into a dataset item.
 *
 * Each line must be a JSON object with non-empty `question` and `answer`
 * fields. `id` is optional; when it is absent the 1-based line number is used
 * (`gsm8k_<line>`).
 *
 * @param path - Path to the JSONL file.
 * @returns Items in file order.
 * @throws Error naming the offending line when a line is not valid JSON, is
 *   not a JSON object, or lacks a usable `question`/`answer`.
 */
function parseJsonl(path: string): Gsm8kItem[] {
  const raw = readFileSync(path, 'utf8')
  const out: Gsm8kItem[] = []
  let lineNo = 0
  for (const line of raw.split('\n')) {
    lineNo++
    // trim() also strips the '\r' left behind by CRLF line endings.
    const trimmed = line.trim()
    if (!trimmed) continue

    let parsed: unknown
    try {
      parsed = JSON.parse(trimmed)
    } catch (e) {
      const detail = e instanceof Error ? e.message : String(e)
      throw new Error(`GSM8K JSONL parse error at line ${lineNo}: ${detail}`)
    }
    // `null`, numbers, strings and arrays are valid JSON but not records;
    // reject them here so the failure names the line instead of surfacing as
    // a TypeError on property access.
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      throw new Error(`GSM8K JSONL line ${lineNo} is not a JSON object`)
    }
    const row = parsed as Record<string, unknown>

    const id = String(row.id ?? `gsm8k_${lineNo}`)
    const question = gsm8kFieldText(row.question)
    const answer = gsm8kFieldText(row.answer)
    if (!question || !answer) {
      throw new Error(`GSM8K JSONL line ${lineNo} missing question/answer`)
    }
    out.push({ id, payload: { question, answer } })
  }
  return out
}

/**
 * Render a scalar JSON field as text. Missing values and nested
 * objects/arrays yield '' so the caller reports them as missing rather than
 * storing "[object Object]".
 */
function gsm8kFieldText(value: unknown): string {
  if (value === null || value === undefined || typeof value === 'object') return ''
  return String(value)
}
98
-
99
/**
 * Regex source for one numeric literal:
 *  - an optional minus sign, accepted only when it is not glued to a
 *    preceding digit (so "10-12" reads as 10 and 12, not 10 and -12);
 *  - then either digits with optional thousands commas and an optional
 *    fractional part, or a bare leading-dot decimal such as ".5".
 */
const GSM8K_NUMBER_SOURCE = String.raw`(?:(?<!\d)-)?(?:\d[\d,]*(?:\.\d+)?|(?<!\d)\.\d+)`
const GSM8K_MARKER_RE = new RegExp(`####\\s*(${GSM8K_NUMBER_SOURCE})`)
const GSM8K_NUMBER_RE = new RegExp(GSM8K_NUMBER_SOURCE, 'g')

/** Strip thousands separators and convert; null when the result is not finite. */
function gsm8kLiteralToNumber(literal: string): number | null {
  const value = Number(literal.replace(/,/g, ''))
  return Number.isFinite(value) ? value : null
}

/**
 * Parse a GSM8K-style answer.
 *
 * Honors the dataset's `#### N` convention (the canonical answer follows the
 * first `####`); when no marker with a number is present, returns the LAST
 * numeric literal in the string.
 *
 * @param text - Reference answer or model response.
 * @returns The parsed number, or null when `text` is empty or has no number.
 */
export function parseGsm8kAnswer(text: string): number | null {
  if (!text) return null

  const marked = GSM8K_MARKER_RE.exec(text)
  if (marked?.[1] !== undefined) {
    const value = gsm8kLiteralToNumber(marked[1])
    if (value !== null) return value
  }

  // No usable marker: fall back to the last numeric literal anywhere.
  const literals = text.match(GSM8K_NUMBER_RE)
  if (!literals || literals.length === 0) return null
  const last = literals[literals.length - 1]
  return last === undefined ? null : gsm8kLiteralToNumber(last)
}
120
-
121
// Shared instance behind the function-style exports of this module.
const adapter = new Gsm8kAdapter()

export { Gsm8kAdapter }

// Each export is the corresponding adapter method bound to the shared instance.
export const loadDataset = adapter.loadDataset.bind(adapter)
export const evaluate = adapter.evaluate.bind(adapter)
export const assignSplit = adapter.assignSplit.bind(adapter)
@@ -1,178 +0,0 @@
1
- /**
2
- * SWE-Bench Lite wrapper — 30-instance subset.
3
- *
4
- * Status: STUB. The actual SWE-Bench harness needs a Docker host and
5
- * is too heavy to ship inside this package. We expose the contract
6
- * (loadDataset, evaluate, assignSplit) so consumers can plug in their
7
- * own grader without touching call sites.
8
- *
9
- * Wire-up paths in priority order:
10
- *
11
- * 1. `process.env.AGENT_EVAL_SWEBENCH_PATH` → JSONL with the 30
12
- * lite instances + per-instance metadata (instance_id,
13
- * problem_statement, base_commit, repo, FAIL_TO_PASS,
14
- * PASS_TO_PASS).
15
- * 2. `process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD` → executable
16
- * that reads `{instance_id, patch}` JSON on stdin and writes
17
- * `{passed, fail_to_pass_passed, pass_to_pass_passed, log}`
18
- * JSON on stdout. Implementations can shell out to the
19
- * official `swebench` runner here.
20
- *
21
- * If neither is set, every public method throws a clearly-marked
22
- * "not implemented" error. The stub fails LOUD; it never silently
23
- * scores zero.
24
- */
25
-
26
- import { existsSync, readFileSync } from 'node:fs'
27
- import { spawn } from 'node:child_process'
28
-
29
- import type {
30
- BenchmarkAdapter,
31
- BenchmarkDatasetItem,
32
- BenchmarkEvaluation,
33
- } from '../../../src/benchmarks/types'
34
- import { deterministicSplit } from '../../../src/benchmarks/types'
35
- import type { RunSplitTag } from '../../../src/run-record'
36
-
37
/**
 * Per-instance metadata for one SWE-Bench Lite item, as read from the JSONL
 * dataset file.
 */
export interface SweBenchLitePayload {
  /** From `instance_id` (or `instanceId`); also used as the item id. */
  instanceId: string
  /** From `problem_statement` (or `problemStatement`). */
  problemStatement: string
  /** From `base_commit` (or `baseCommit`). */
  baseCommit: string
  /** From `repo`. */
  repo: string
  /** From `FAIL_TO_PASS` (or `failToPass`). */
  failToPass: string[]
  /** From `PASS_TO_PASS` (or `passToPass`). */
  passToPass: string[]
}
45
-
46
/** A benchmark dataset item whose payload is SWE-Bench Lite instance metadata. */
export type SweBenchLiteItem = BenchmarkDatasetItem<SweBenchLitePayload>
47
-
48
/**
 * SWE-Bench Lite adapter. The dataset and the grader are both supplied by the
 * caller through environment variables; nothing is bundled here.
 */
class SweBenchLiteAdapter
  implements BenchmarkAdapter<SweBenchLiteItem, SweBenchLitePayload>
{
  /**
   * Read the JSONL file named by `AGENT_EVAL_SWEBENCH_PATH` and keep only the
   * items that `assignSplitImpl` places in `split`.
   *
   * @throws Error when the environment variable is unset or the file is missing.
   */
  async loadDataset(split: RunSplitTag): Promise<SweBenchLiteItem[]> {
    const path = process.env.AGENT_EVAL_SWEBENCH_PATH
    if (!path) {
      throw new Error(
        'SWE-Bench Lite dataset not provided. Set AGENT_EVAL_SWEBENCH_PATH to a JSONL file ' +
          'with the 30 lite instances. STUB: this wrapper does not bundle the dataset; ' +
          'see https://www.swebench.com/lite.html for the canonical source.',
      )
    }
    if (!existsSync(path)) {
      throw new Error(`AGENT_EVAL_SWEBENCH_PATH=${path} does not exist`)
    }
    const all = parseJsonl(path)
    return all.filter((it) => assignSplitImpl(it.id) === split)
  }

  /**
   * Grade `response` (treated as the patch) by piping `{instance_id, patch}`
   * JSON to the command in `AGENT_EVAL_SWEBENCH_GRADER_CMD` and reading its
   * JSON verdict from stdout.
   *
   * @returns score 1 when the grader reports `passed`, otherwise 0; `raw`
   *   carries the individual flags and the first 4000 characters of the log.
   * @throws Error when the grader is not configured, or when its stdout is
   *   not a JSON object. Malformed output is never scored as a silent zero.
   */
  async evaluate(item: SweBenchLiteItem, response: string): Promise<BenchmarkEvaluation> {
    const cmd = process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD
    if (!cmd) {
      throw new Error(
        'SWE-Bench Lite grader not configured. Set AGENT_EVAL_SWEBENCH_GRADER_CMD to an ' +
          'executable that reads {instance_id, patch} JSON on stdin and writes ' +
          '{passed, fail_to_pass_passed, pass_to_pass_passed, log} JSON on stdout. ' +
          'TODO(swebench-lite): bundle a default Docker-based runner once the SDK ' +
          'stabilises (https://github.com/swe-bench/SWE-bench).',
      )
    }
    const stdinPayload = JSON.stringify({ instance_id: item.payload.instanceId, patch: response })
    const result = await runGrader(cmd, stdinPayload)

    let decoded: unknown
    try {
      decoded = JSON.parse(result.stdout)
    } catch (e) {
      const detail = e instanceof Error ? e.message : String(e)
      throw new Error(
        `SWE-Bench grader emitted non-JSON stdout: ${detail}\n` +
          `stdout=${result.stdout.slice(0, 400)}\nstderr=${result.stderr.slice(0, 400)}`,
      )
    }
    // `null`, scalars and arrays are valid JSON but carry no verdict fields.
    if (typeof decoded !== 'object' || decoded === null || Array.isArray(decoded)) {
      throw new Error(
        'SWE-Bench grader emitted JSON that is not an object\n' +
          `stdout=${result.stdout.slice(0, 400)}\nstderr=${result.stderr.slice(0, 400)}`,
      )
    }
    const parsed = decoded as Record<string, unknown>

    const passed = sweBenchGraderFlag(parsed.passed)
    return {
      score: passed ? 1 : 0,
      raw: {
        passed,
        failToPassPassed: sweBenchGraderFlag(parsed.fail_to_pass_passed),
        passToPassPassed: sweBenchGraderFlag(parsed.pass_to_pass_passed),
        graderLog: typeof parsed.log === 'string' ? parsed.log.slice(0, 4000) : '',
      },
    }
  }

  /** Report which split `itemId` belongs to. */
  assignSplit(itemId: string): RunSplitTag {
    return assignSplitImpl(itemId)
  }
}

/**
 * Coerce a grader verdict field to a boolean. Non-string values use normal
 * truthiness; strings are checked explicitly so that "false" and "0" (which
 * are truthy as strings) are not counted as a pass.
 */
function sweBenchGraderFlag(value: unknown): boolean {
  if (typeof value === 'string') {
    const normalized = value.trim().toLowerCase()
    return normalized !== '' && normalized !== 'false' && normalized !== '0'
  }
  return Boolean(value)
}
105
-
106
/**
 * Resolve the split for an instance by passing a `swebench-lite::`-prefixed
 * key to `deterministicSplit`.
 */
function assignSplitImpl(itemId: string): RunSplitTag {
  const namespacedKey = `swebench-lite::${itemId}`
  return deterministicSplit(namespacedKey)
}
109
-
110
/**
 * Read a SWE-Bench Lite JSONL file and turn every non-blank line into a
 * dataset item. Both snake_case (`instance_id`) and camelCase (`instanceId`)
 * field names are accepted.
 *
 * @param path - Path to the JSONL file.
 * @returns Items in file order; the item id is the instance id.
 * @throws Error naming the offending line when a line is not valid JSON, is
 *   not a JSON object, or has no instance id.
 */
function parseJsonl(path: string): SweBenchLiteItem[] {
  const raw = readFileSync(path, 'utf8')
  const out: SweBenchLiteItem[] = []
  let lineNo = 0
  for (const line of raw.split('\n')) {
    lineNo++
    // trim() also strips the '\r' left behind by CRLF line endings.
    const trimmed = line.trim()
    if (!trimmed) continue

    let parsed: unknown
    try {
      parsed = JSON.parse(trimmed)
    } catch (e) {
      // Attach the line number so a bad record can be located in the file.
      const detail = e instanceof Error ? e.message : String(e)
      throw new Error(`swebench-lite JSONL parse error at line ${lineNo}: ${detail}`)
    }
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      throw new Error(`swebench-lite line ${lineNo} is not a JSON object`)
    }
    const row = parsed as Record<string, unknown>

    const instanceId = String(row.instance_id ?? row.instanceId ?? '')
    if (!instanceId) {
      throw new Error(`swebench-lite line ${lineNo} missing instance_id`)
    }
    out.push({
      id: instanceId,
      payload: {
        instanceId,
        problemStatement: String(row.problem_statement ?? row.problemStatement ?? ''),
        baseCommit: String(row.base_commit ?? row.baseCommit ?? ''),
        repo: String(row.repo ?? ''),
        failToPass: asStringArray(row.FAIL_TO_PASS ?? row.failToPass),
        passToPass: asStringArray(row.PASS_TO_PASS ?? row.passToPass),
      },
    })
  }
  return out
}
137
-
138
/**
 * Normalize a test-list field to `string[]`.
 *
 * Accepted shapes:
 *  - an array: non-string entries are dropped;
 *  - a string holding a JSON array: decoded, non-string entries dropped;
 *  - a string holding a JSON string: the decoded string as a single entry;
 *  - any other non-blank string, including bare tokens such as "42" that
 *    happen to be valid JSON: the string itself as a single entry.
 *
 * Everything else (blank strings, JSON `null`/objects, non-string values)
 * yields an empty list.
 */
function asStringArray(v: unknown): string[] {
  if (Array.isArray(v)) return v.filter((x): x is string => typeof x === 'string')
  if (typeof v !== 'string') return []
  // A blank field means "no tests", not a single test with an empty name.
  if (v.trim() === '') return []

  let decoded: unknown
  try {
    decoded = JSON.parse(v)
  } catch {
    // Not JSON at all: treat the plain string as a single-element list.
    return [v]
  }
  if (Array.isArray(decoded)) return decoded.filter((x): x is string => typeof x === 'string')
  if (typeof decoded === 'string') return [decoded]
  if (decoded === null || typeof decoded === 'object') return []
  // A bare number/boolean token is more plausibly a name than a JSON value.
  return [v]
}
151
-
152
/**
 * Run the grader command, feed it `stdin`, and collect its output.
 *
 * `cmd` is split on whitespace into an executable and its arguments; no
 * shell is involved, so quoting inside `cmd` is not interpreted.
 *
 * @param cmd - Command line of the grader.
 * @param stdin - Text written to the grader's standard input.
 * @returns Captured stdout and stderr once the process exits with code 0.
 * @throws (rejects) when the command is empty, the process cannot be
 *   spawned, it exits non-zero, or it is terminated by a signal.
 */
function runGrader(cmd: string, stdin: string): Promise<{ stdout: string; stderr: string }> {
  return new Promise((resolve, reject) => {
    // Trim first: leading whitespace would otherwise yield '' as the executable.
    const [executable, ...args] = cmd.trim().split(/\s+/)
    if (!executable) {
      reject(new Error('grader command is empty'))
      return
    }
    const child = spawn(executable, args, { stdio: ['pipe', 'pipe', 'pipe'] })
    let stdout = ''
    let stderr = ''
    // Decode at the stream level so a multi-byte character split across two
    // chunks is reassembled instead of being corrupted.
    child.stdout.setEncoding('utf8')
    child.stderr.setEncoding('utf8')
    child.stdout.on('data', (chunk: string) => {
      stdout += chunk
    })
    child.stderr.on('data', (chunk: string) => {
      stderr += chunk
    })
    child.on('error', reject)
    child.on('close', (code, signal) => {
      if (code !== 0) {
        const how =
          code === null ? `was terminated by signal ${signal}` : `exited with code ${code}`
        reject(new Error(`grader ${how}: ${stderr.slice(0, 400)}`))
        return
      }
      resolve({ stdout, stderr })
    })
    // A grader that exits without reading stdin makes the write fail (EPIPE).
    // Without a listener that error is thrown as an uncaught exception; the
    // 'close' handler above already reports how the process ended.
    child.stdin.on('error', () => {})
    child.stdin.end(stdin)
  })
}
172
-
173
// Shared instance behind the function-style exports of this module.
const adapter = new SweBenchLiteAdapter()

export { SweBenchLiteAdapter }

// Each export is the corresponding adapter method bound to the shared instance.
export const loadDataset = adapter.loadDataset.bind(adapter)
export const evaluate = adapter.evaluate.bind(adapter)
export const assignSplit = adapter.assignSplit.bind(adapter)
@@ -1,114 +0,0 @@
1
- import {
2
- runMultiShotOptimization,
3
- trialTraceFromMultiShotTrial,
4
- type MultiShotVariant,
5
- type RunRecord,
6
- } from '@tangle-network/agent-eval'
7
-
8
/** Variant payload used by this demo: a prompt plus a canned quality value. */
type Payload = {
  /** Instruction text echoed into the fake assistant turn. */
  instruction: string
  /** Number the demo scorer returns directly as the variant's score. */
  quality: number
}
12
-
13
// Generation-0 seed variant; the gate below refers to it by the key 'baseline'.
const baselinePayload: Payload = {
  instruction: 'Complete the user task.',
  quality: 0.45,
}

const baseline: MultiShotVariant<Payload> = {
  id: 'baseline',
  label: 'baseline',
  generation: 0,
  payload: baselinePayload,
}
22
-
23
// End-to-end demo: every stage (runner, scorer, mutator) is a deterministic
// stub, so the optimization loop can be exercised without calling a model.
const result = await runMultiShotOptimization<Payload>({
  runId: 'demo-multi-shot',
  target: 'demo-agent-system-prompt',
  seedVariants: [baseline],
  searchScenarioIds: ['search-brief', 'search-code-review', 'search-research'],
  reps: 1,
  generations: 2,
  populationSize: 2,
  scoreConcurrency: 2,
  runner: {
    // Fabricates a two-turn trace that echoes the variant's payload.
    async run({ variant, scenarioId }) {
      const { instruction, quality } = variant.payload
      const qualityLine = `quality=${quality}`
      return {
        trace: {
          scenarioId,
          turns: [
            { role: 'user', content: `Run ${scenarioId}` },
            { role: 'assistant', content: `${instruction} ${qualityLine}` },
          ],
          output: qualityLine,
        },
        costUsd: 0.01,
        durationMs: 50,
      }
    },
  },
  scorer: {
    // The score is the payload's quality; below 0.8 one finding is attached.
    async score({ variant }) {
      const { quality } = variant.payload
      const meetsBar = quality >= 0.8
      return {
        score: quality,
        ok: true,
        asi: meetsBar
          ? []
          : [
              {
                expectationId: 'complete-task',
                message: 'The agent did not fully complete the task.',
                severity: 'error',
                responsibleSurface: 'system-prompt',
                suggestion: 'Make completion criteria explicit before final response.',
              },
            ],
      }
    },
  },
  mutateAdapter: {
    // Produces `childCount` children with a fixed quality of 0.9; the
    // rationale is the newline-joined expectation phrases of the bottom trials.
    async mutate({ parent, bottomTrials, childCount, generation }) {
      const rationale = bottomTrials
        .map((trial) => trialTraceFromMultiShotTrial(trial))
        .flatMap((trace) => (trace.expectations ?? []).map((expectation) => expectation.phrase))
        .join('\n')
      const instruction = `${parent.payload.instruction} Verify every requested step before final answer.`
      return Array.from({ length: childCount }, (_unused, childIndex) => ({
        id: `${parent.id}.g${generation}.${childIndex}`,
        label: 'completion-focused',
        generation,
        payload: {
          instruction,
          quality: 0.9,
        },
        rationale,
      }))
    },
  },
  gate: {
    holdoutScenarioIds: ['holdout-brief', 'holdout-code-review', 'holdout-research'],
    gate: {
      baselineKey: 'baseline',
      minProductiveRuns: 3,
      pairedDeltaThreshold: 0,
      seed: 7,
    },
    // Builds a run record with placeholder hashes; the score is stored under
    // `holdoutScore` or `searchScore` depending on the split.
    toRunRecord: ({ variant, scenarioId, rep, split, seed, trial }): RunRecord => {
      const scoreKey = split === 'holdout' ? 'holdoutScore' : 'searchScore'
      return {
        runId: `demo-${variant.id}-${scenarioId}-${rep}-${split}`,
        experimentId: scenarioId,
        candidateId: variant.id,
        seed,
        model: 'demo-model@2026-01-01',
        promptHash: 'p'.repeat(64),
        configHash: 'c'.repeat(64),
        commitSha: 'deadbeef',
        wallMs: trial.durationMs ?? 0,
        costUsd: trial.cost ?? 0,
        tokenUsage: { input: 1, output: 1 },
        outcome: {
          [scoreKey]: trial.score,
          raw: { score: trial.score },
        },
        splitTag: split,
      }
    },
  },
})

// Print which variant won the search, which one was promoted, and the gate decision.
const gateDecision = result.gate?.decision ?? null
console.log({
  searchBest: result.searchBestVariant.id,
  promoted: result.promotedVariant.id,
  gate: gateDecision,
})
@@ -1,63 +0,0 @@
1
- import {
2
- InMemoryTraceStore,
3
- SandboxHarness,
4
- SubprocessSandboxDriver,
5
- TraceEmitter,
6
- } from '@tangle-network/agent-eval'
7
-
8
/**
 * Same-sandbox pattern:
 *  - one driver owns one workdir
 *  - the harness runs setup/build/test there
 *  - later checks can inspect files/logs/screenshots produced by those phases
 *
 * Replace `workdir` with a generated app, browser automation checkout, or
 * remote computer-use workspace.
 *
 * @param workdir - Directory passed to the sandbox driver as `cwd`.
 * @returns The harness result, the judge verdict derived from it, and the
 *   runs listed by the in-memory trace store.
 */
export async function runSameSandboxExample(workdir: string) {
  const store = new InMemoryTraceStore()
  const driver = new SubprocessSandboxDriver({ cwd: workdir })
  const harness = new SandboxHarness(driver)
  const emitter = new TraceEmitter(store)
  await emitter.startRun({
    scenarioId: 'same-sandbox-example',
    layer: 'app-build',
  })

  const result = await harness.run({
    setupCommand: 'pnpm install --frozen-lockfile',
    runCommand: 'pnpm build',
    testCommand: 'pnpm test',
    timeoutMs: 180_000,
  }, emitter)

  // Human-readable evidence attached to the judge record; the final entry is
  // the last 2000 characters of the test phase's stdout.
  const summary = [
    `passed=${result.passed}`,
    `score=${result.score}`,
    `build=${result.run?.exitCode ?? 'not-run'}`,
    `test=${result.test?.exitCode ?? 'not-run'}`,
    result.test?.stdout?.slice(-2000) ?? '',
  ].join('\n')

  // Decide from the exit code itself, not by searching `summary` for
  // "test=0": the summary embeds arbitrary test output, which could contain
  // that text regardless of how the test phase ended.
  const testExitCode = result.test?.exitCode
  const testExitedCleanly = testExitCode != null && String(testExitCode) === '0'
  const evidencePassed = result.passed && testExitedCleanly

  // Score and rationale come from the same flag so they cannot disagree.
  const judged = {
    score: evidencePassed ? 1 : 0,
    rationale: evidencePassed
      ? 'Shared sandbox produced passing build/test evidence.'
      : 'Shared sandbox did not produce passing build/test evidence.',
  }
  await emitter.recordJudge({
    judgeId: 'same-sandbox-evidence',
    name: 'same-sandbox-evidence',
    dimension: 'evidence',
    score: judged.score,
    rationale: judged.rationale,
    evidence: summary,
  })
  await emitter.endRun({
    pass: result.passed,
    score: result.score,
    notes: judged.rationale,
  })

  return { result, judged, traces: await store.listRuns() }
}