npm - @tangle-network/agent-eval - Versions diffs - 0.20.7 → 0.20.9 - Mend

@tangle-network/agent-eval 0.20.7 → 0.20.9

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (21)

package/LICENSE +21 -0
package/README.md +9 -6
package/dist/benchmarks/index.d.ts +1 -0
package/dist/benchmarks/index.js +12 -0
package/dist/benchmarks/index.js.map +1 -0
package/dist/chunk-XDGJUIV2.js +219 -0
package/dist/chunk-XDGJUIV2.js.map +1 -0
package/dist/index-CEWY1rmu.d.ts +290 -0
package/dist/index.d.ts +61 -298
package/dist/index.js +139 -248
package/dist/index.js.map +1 -1
package/dist/openapi.json +477 -0
package/docs/concepts.md +4 -4
package/docs/knowledge-readiness.md +2 -2
package/docs/wire-protocol.md +3 -3
package/package.json +14 -7
package/examples/benchmarks/README.md +0 -44
package/examples/benchmarks/gsm8k/index.ts +0 -126
package/examples/benchmarks/swebench-lite/index.ts +0 -178
package/examples/multi-shot-optimization/index.ts +0 -114
package/examples/same-sandbox-harness/index.ts +0 -63

package/examples/benchmarks/gsm8k/index.ts DELETED Viewed

@@ -1,126 +0,0 @@
-/**
- * GSM8K wrapper — exact-match grading on the final numeric answer.
- *
- * The dataset itself is NOT bundled. `loadDataset` will:
- *   1. read from `process.env.AGENT_EVAL_GSM8K_PATH` if set (a JSONL
- *      file with `{ id, question, answer }` records — the standard
- *      HF mirror layout converted to JSONL);
- *   2. otherwise throw a clearly-marked error pointing to the loader.
- *
- * `evaluate` parses the final number out of the response (last
- * occurrence of a signed-decimal-or-integer literal, optionally after
- * `####`, the GSM8K answer convention) and compares to the ground-
- * truth integer. Floating-point comparisons use a 1e-6 tolerance.
- */
-import { existsSync, readFileSync } from 'node:fs'
-import type {
-  BenchmarkAdapter,
-  BenchmarkDatasetItem,
-  BenchmarkEvaluation,
-} from '../../../src/benchmarks/types'
-import { deterministicSplit } from '../../../src/benchmarks/types'
-import type { RunSplitTag } from '../../../src/run-record'
-export interface Gsm8kPayload {
-  question: string
-  /** Reference answer, post-#### normalization. May be a number or
-   *  a numeric string ("72", "1.5"). */
-  answer: string
-}
-export type Gsm8kItem = BenchmarkDatasetItem<Gsm8kPayload>
-class Gsm8kAdapter implements BenchmarkAdapter<Gsm8kItem, Gsm8kPayload> {
-  async loadDataset(split: RunSplitTag): Promise<Gsm8kItem[]> {
-    const path = process.env.AGENT_EVAL_GSM8K_PATH
-    if (!path) {
-      throw new Error(
-        'GSM8K dataset not provided. Set AGENT_EVAL_GSM8K_PATH to a JSONL file ' +
-          'with {id, question, answer} records (the HF GSM8K mirror converted to JSONL).',
-      )
-    }
-    if (!existsSync(path)) {
-      throw new Error(`AGENT_EVAL_GSM8K_PATH=${path} does not exist`)
-    }
-    const items = parseJsonl(path).filter((it) => assignSplitImpl(it.id) === split)
-    return items
-  }
-  async evaluate(item: Gsm8kItem, response: string): Promise<BenchmarkEvaluation> {
-    const expected = parseGsm8kAnswer(item.payload.answer)
-    const observed = parseGsm8kAnswer(response)
-    if (expected === null) {
-      // Defensive: the dataset should never ship a non-numeric ref.
-      return { score: 0, raw: { reason: 'reference_not_numeric', expected: item.payload.answer } }
-    }
-    if (observed === null) {
-      return { score: 0, raw: { reason: 'no_numeric_in_response', expected, observed: null } }
-    }
-    const ok = Math.abs(expected - observed) < 1e-6
-    return { score: ok ? 1 : 0, raw: { expected, observed, exactMatch: ok } }
-  }
-  assignSplit(itemId: string): RunSplitTag {
-    return assignSplitImpl(itemId)
-  }
-}
-function assignSplitImpl(itemId: string): RunSplitTag {
-  return deterministicSplit(`gsm8k::${itemId}`)
-}
-function parseJsonl(path: string): Gsm8kItem[] {
-  const raw = readFileSync(path, 'utf8')
-  const out: Gsm8kItem[] = []
-  let lineNo = 0
-  for (const line of raw.split('\n')) {
-    lineNo++
-    const trimmed = line.trim()
-    if (!trimmed) continue
-    let row: Record<string, unknown>
-    try {
-      row = JSON.parse(trimmed) as Record<string, unknown>
-    } catch (e) {
-      throw new Error(`GSM8K JSONL parse error at line ${lineNo}: ${(e as Error).message}`)
-    }
-    const id = String(row.id ?? `gsm8k_${lineNo}`)
-    const question = String(row.question ?? '')
-    const answer = String(row.answer ?? '')
-    if (!question || !answer) {
-      throw new Error(`GSM8K JSONL line ${lineNo} missing question/answer`)
-    }
-    out.push({ id, payload: { question, answer } })
-  }
-  return out
-}
-/**
- * Parse a GSM8K-style answer. Honors the dataset's `#### N`
- * convention (the canonical answer comes after `####`); otherwise
- * returns the LAST signed numeric literal in the string.
- */
-export function parseGsm8kAnswer(text: string): number | null {
-  if (!text) return null
-  const afterMarker = text.match(/####\s*(-?\d[\d,]*\.?\d*)/)
-  if (afterMarker) {
-    const cleaned = afterMarker[1]!.replace(/,/g, '')
-    const v = Number(cleaned)
-    if (Number.isFinite(v)) return v
-  }
-  // Last numeric literal anywhere in the string.
-  const matches = text.match(/-?\d[\d,]*\.?\d*/g)
-  if (!matches || matches.length === 0) return null
-  const last = matches[matches.length - 1]!
-  const cleaned = last.replace(/,/g, '')
-  const v = Number(cleaned)
-  return Number.isFinite(v) ? v : null
-}
-const adapter = new Gsm8kAdapter()
-export const loadDataset = adapter.loadDataset.bind(adapter)
-export const evaluate = adapter.evaluate.bind(adapter)
-export const assignSplit = adapter.assignSplit.bind(adapter)
-export { Gsm8kAdapter }

package/examples/benchmarks/swebench-lite/index.ts DELETED Viewed

@@ -1,178 +0,0 @@
-/**
- * SWE-Bench Lite wrapper — 30-instance subset.
- *
- * Status: STUB. The actual SWE-Bench harness needs a Docker host and
- * is too heavy to ship inside this package. We expose the contract
- * (loadDataset, evaluate, assignSplit) so consumers can plug in their
- * own grader without touching call sites.
- *
- * Wire-up paths in priority order:
- *
- *   1. `process.env.AGENT_EVAL_SWEBENCH_PATH` → JSONL with the 30
- *      lite instances + per-instance metadata (instance_id,
- *      problem_statement, base_commit, repo, FAIL_TO_PASS,
- *      PASS_TO_PASS).
- *   2. `process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD` → executable
- *      that reads `{instance_id, patch}` JSON on stdin and writes
- *      `{passed, fail_to_pass_passed, pass_to_pass_passed, log}`
- *      JSON on stdout. Implementations can shell out to the
- *      official `swebench` runner here.
- *
- * If neither is set, every public method throws a clearly-marked
- * "not implemented" error. The stub fails LOUD; it never silently
- * scores zero.
- */
-import { existsSync, readFileSync } from 'node:fs'
-import { spawn } from 'node:child_process'
-import type {
-  BenchmarkAdapter,
-  BenchmarkDatasetItem,
-  BenchmarkEvaluation,
-} from '../../../src/benchmarks/types'
-import { deterministicSplit } from '../../../src/benchmarks/types'
-import type { RunSplitTag } from '../../../src/run-record'
-export interface SweBenchLitePayload {
-  instanceId: string
-  problemStatement: string
-  baseCommit: string
-  repo: string
-  failToPass: string[]
-  passToPass: string[]
-}
-export type SweBenchLiteItem = BenchmarkDatasetItem<SweBenchLitePayload>
-class SweBenchLiteAdapter
-  implements BenchmarkAdapter<SweBenchLiteItem, SweBenchLitePayload>
-{
-  async loadDataset(split: RunSplitTag): Promise<SweBenchLiteItem[]> {
-    const path = process.env.AGENT_EVAL_SWEBENCH_PATH
-    if (!path) {
-      throw new Error(
-        'SWE-Bench Lite dataset not provided. Set AGENT_EVAL_SWEBENCH_PATH to a JSONL file ' +
-          'with the 30 lite instances. STUB: this wrapper does not bundle the dataset; ' +
-          'see https://www.swebench.com/lite.html for the canonical source.',
-      )
-    }
-    if (!existsSync(path)) {
-      throw new Error(`AGENT_EVAL_SWEBENCH_PATH=${path} does not exist`)
-    }
-    const all = parseJsonl(path)
-    return all.filter((it) => assignSplitImpl(it.id) === split)
-  }
-  async evaluate(item: SweBenchLiteItem, response: string): Promise<BenchmarkEvaluation> {
-    const cmd = process.env.AGENT_EVAL_SWEBENCH_GRADER_CMD
-    if (!cmd) {
-      throw new Error(
-        'SWE-Bench Lite grader not configured. Set AGENT_EVAL_SWEBENCH_GRADER_CMD to an ' +
-          'executable that reads {instance_id, patch} JSON on stdin and writes ' +
-          '{passed, fail_to_pass_passed, pass_to_pass_passed, log} JSON on stdout. ' +
-          'TODO(swebench-lite): bundle a default Docker-based runner once the SDK ' +
-          'stabilises (https://github.com/swe-bench/SWE-bench).',
-      )
-    }
-    const stdinPayload = JSON.stringify({ instance_id: item.payload.instanceId, patch: response })
-    const result = await runGrader(cmd, stdinPayload)
-    let parsed: Record<string, unknown>
-    try {
-      parsed = JSON.parse(result.stdout) as Record<string, unknown>
-    } catch (e) {
-      throw new Error(
-        `SWE-Bench grader emitted non-JSON stdout: ${(e as Error).message}\n` +
-          `stdout=${result.stdout.slice(0, 400)}\nstderr=${result.stderr.slice(0, 400)}`,
-      )
-    }
-    const passed = Boolean(parsed.passed)
-    return {
-      score: passed ? 1 : 0,
-      raw: {
-        passed,
-        failToPassPassed: Boolean(parsed.fail_to_pass_passed),
-        passToPassPassed: Boolean(parsed.pass_to_pass_passed),
-        graderLog: typeof parsed.log === 'string' ? parsed.log.slice(0, 4000) : '',
-      },
-    }
-  }
-  assignSplit(itemId: string): RunSplitTag {
-    return assignSplitImpl(itemId)
-  }
-}
-function assignSplitImpl(itemId: string): RunSplitTag {
-  return deterministicSplit(`swebench-lite::${itemId}`)
-}
-function parseJsonl(path: string): SweBenchLiteItem[] {
-  const raw = readFileSync(path, 'utf8')
-  const out: SweBenchLiteItem[] = []
-  let lineNo = 0
-  for (const line of raw.split('\n')) {
-    lineNo++
-    const trimmed = line.trim()
-    if (!trimmed) continue
-    const row = JSON.parse(trimmed) as Record<string, unknown>
-    const instanceId = String(row.instance_id ?? row.instanceId ?? '')
-    if (!instanceId) {
-      throw new Error(`swebench-lite line ${lineNo} missing instance_id`)
-    }
-    out.push({
-      id: instanceId,
-      payload: {
-        instanceId,
-        problemStatement: String(row.problem_statement ?? row.problemStatement ?? ''),
-        baseCommit: String(row.base_commit ?? row.baseCommit ?? ''),
-        repo: String(row.repo ?? ''),
-        failToPass: asStringArray(row.FAIL_TO_PASS ?? row.failToPass),
-        passToPass: asStringArray(row.PASS_TO_PASS ?? row.passToPass),
-      },
-    })
-  }
-  return out
-}
-function asStringArray(v: unknown): string[] {
-  if (Array.isArray(v)) return v.filter((x): x is string => typeof x === 'string')
-  if (typeof v === 'string') {
-    try {
-      const parsed = JSON.parse(v)
-      if (Array.isArray(parsed)) return parsed.filter((x): x is string => typeof x === 'string')
-    } catch {
-      // Plain string; treat as a single-element list.
-      return [v]
-    }
-  }
-  return []
-}
-function runGrader(cmd: string, stdin: string): Promise<{ stdout: string; stderr: string }> {
-  return new Promise((resolve, reject) => {
-    const parts = cmd.split(/\s+/)
-    const child = spawn(parts[0]!, parts.slice(1), { stdio: ['pipe', 'pipe', 'pipe'] })
-    let stdout = ''
-    let stderr = ''
-    child.stdout.on('data', (b: Buffer) => (stdout += b.toString('utf8')))
-    child.stderr.on('data', (b: Buffer) => (stderr += b.toString('utf8')))
-    child.on('error', reject)
-    child.on('close', (code) => {
-      if (code !== 0) {
-        reject(new Error(`grader exited with code ${code}: ${stderr.slice(0, 400)}`))
-        return
-      }
-      resolve({ stdout, stderr })
-    })
-    child.stdin.write(stdin)
-    child.stdin.end()
-  })
-}
-const adapter = new SweBenchLiteAdapter()
-export const loadDataset = adapter.loadDataset.bind(adapter)
-export const evaluate = adapter.evaluate.bind(adapter)
-export const assignSplit = adapter.assignSplit.bind(adapter)
-export { SweBenchLiteAdapter }

package/examples/multi-shot-optimization/index.ts DELETED Viewed

@@ -1,114 +0,0 @@
-import {
-  runMultiShotOptimization,
-  trialTraceFromMultiShotTrial,
-  type MultiShotVariant,
-  type RunRecord,
-} from '@tangle-network/agent-eval'
-type Payload = {
-  instruction: string
-  quality: number
-}
-const baseline: MultiShotVariant<Payload> = {
-  id: 'baseline',
-  label: 'baseline',
-  generation: 0,
-  payload: {
-    instruction: 'Complete the user task.',
-    quality: 0.45,
-  },
-}
-const result = await runMultiShotOptimization<Payload>({
-  runId: 'demo-multi-shot',
-  target: 'demo-agent-system-prompt',
-  seedVariants: [baseline],
-  searchScenarioIds: ['search-brief', 'search-code-review', 'search-research'],
-  reps: 1,
-  generations: 2,
-  populationSize: 2,
-  scoreConcurrency: 2,
-  runner: {
-    async run({ variant, scenarioId }) {
-      return {
-        trace: {
-          scenarioId,
-          turns: [
-            { role: 'user', content: `Run ${scenarioId}` },
-            { role: 'assistant', content: `${variant.payload.instruction} quality=${variant.payload.quality}` },
-          ],
-          output: `quality=${variant.payload.quality}`,
-        },
-        costUsd: 0.01,
-        durationMs: 50,
-      }
-    },
-  },
-  scorer: {
-    async score({ variant }) {
-      return {
-        score: variant.payload.quality,
-        ok: true,
-        asi: variant.payload.quality >= 0.8
-          ? []
-          : [{
-              expectationId: 'complete-task',
-              message: 'The agent did not fully complete the task.',
-              severity: 'error',
-              responsibleSurface: 'system-prompt',
-              suggestion: 'Make completion criteria explicit before final response.',
-            }],
-      }
-    },
-  },
-  mutateAdapter: {
-    async mutate({ parent, bottomTrials, childCount, generation }) {
-      const traces = bottomTrials.map((trial) => trialTraceFromMultiShotTrial(trial))
-      const rationale = traces.flatMap((trace) => (trace.expectations ?? []).map((e) => e.phrase)).join('\n')
-      return Array.from({ length: childCount }, (_, i) => ({
-        id: `${parent.id}.g${generation}.${i}`,
-        label: 'completion-focused',
-        generation,
-        payload: {
-          instruction: `${parent.payload.instruction} Verify every requested step before final answer.`,
-          quality: 0.9,
-        },
-        rationale,
-      }))
-    },
-  },
-  gate: {
-    holdoutScenarioIds: ['holdout-brief', 'holdout-code-review', 'holdout-research'],
-    gate: {
-      baselineKey: 'baseline',
-      minProductiveRuns: 3,
-      pairedDeltaThreshold: 0,
-      seed: 7,
-    },
-    toRunRecord: ({ variant, scenarioId, rep, split, seed, trial }): RunRecord => ({
-      runId: `demo-${variant.id}-${scenarioId}-${rep}-${split}`,
-      experimentId: scenarioId,
-      candidateId: variant.id,
-      seed,
-      model: 'demo-model@2026-01-01',
-      promptHash: 'p'.repeat(64),
-      configHash: 'c'.repeat(64),
-      commitSha: 'deadbeef',
-      wallMs: trial.durationMs ?? 0,
-      costUsd: trial.cost ?? 0,
-      tokenUsage: { input: 1, output: 1 },
-      outcome: {
-        [split === 'holdout' ? 'holdoutScore' : 'searchScore']: trial.score,
-        raw: { score: trial.score },
-      },
-      splitTag: split,
-    }),
-  },
-})
-console.log({
-  searchBest: result.searchBestVariant.id,
-  promoted: result.promotedVariant.id,
-  gate: result.gate?.decision ?? null,
-})

package/examples/same-sandbox-harness/index.ts DELETED Viewed

@@ -1,63 +0,0 @@
-import {
-  InMemoryTraceStore,
-  SandboxHarness,
-  SubprocessSandboxDriver,
-  TraceEmitter,
-} from '@tangle-network/agent-eval'
-/**
- * Same-sandbox pattern:
- * - one driver owns one workdir
- * - the harness runs setup/build/test there
- * - later checks can inspect files/logs/screenshots produced by those phases
- *
- * Replace `workdir` with a generated app, browser automation checkout, or
- * remote computer-use workspace.
- */
-export async function runSameSandboxExample(workdir: string) {
-  const store = new InMemoryTraceStore()
-  const driver = new SubprocessSandboxDriver({ cwd: workdir })
-  const harness = new SandboxHarness(driver)
-  const emitter = new TraceEmitter(store)
-  await emitter.startRun({
-    scenarioId: 'same-sandbox-example',
-    layer: 'app-build',
-  })
-  const result = await harness.run({
-    setupCommand: 'pnpm install --frozen-lockfile',
-    runCommand: 'pnpm build',
-    testCommand: 'pnpm test',
-    timeoutMs: 180_000,
-  }, emitter)
-  const summary = [
-    `passed=${result.passed}`,
-    `score=${result.score}`,
-    `build=${result.run?.exitCode ?? 'not-run'}`,
-    `test=${result.test?.exitCode ?? 'not-run'}`,
-    result.test?.stdout?.slice(-2000) ?? '',
-  ].join('\n')
-  const judged = {
-    score: result.passed && summary.includes('test=0') ? 1 : 0,
-    rationale: result.passed
-      ? 'Shared sandbox produced passing build/test evidence.'
-      : 'Shared sandbox did not produce passing build/test evidence.',
-  }
-  await emitter.recordJudge({
-    judgeId: 'same-sandbox-evidence',
-    name: 'same-sandbox-evidence',
-    dimension: 'evidence',
-    score: judged.score,
-    rationale: judged.rationale,
-    evidence: summary,
-  })
-  await emitter.endRun({
-    pass: result.passed,
-    score: result.score,
-    notes: judged.rationale,
-  })
-  return { result, judged, traces: await store.listRuns() }
-}