npm - @tangle-network/agent-eval - Versions diffs - 0.20.8 → 0.20.10 - Mend

@tangle-network/agent-eval 0.20.8 → 0.20.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

package/CHANGELOG.md +302 -0
package/LICENSE +21 -0
package/README.md +16 -9
package/dist/benchmarks/index.d.ts +1 -0
package/dist/benchmarks/index.js +12 -0
package/dist/benchmarks/index.js.map +1 -0
package/dist/chunk-42I2QC2L.js +219 -0
package/dist/chunk-42I2QC2L.js.map +1 -0
package/dist/{chunk-CJJSB6ZQ.js → chunk-LSR4IAYN.js} +90 -11
package/dist/chunk-LSR4IAYN.js.map +1 -0
package/dist/cli.js +1 -1
package/dist/index-1PZOtZFr.d.ts +290 -0
package/dist/index.d.ts +37 -298
package/dist/index.js +130 -252
package/dist/index.js.map +1 -1
package/dist/openapi.json +502 -0
package/dist/{sink-fetch-C0B8ximv.d.ts → sink-fetch-B1Yg4Til.d.ts} +1 -1
package/dist/telemetry/file.d.ts +1 -1
package/dist/telemetry/index.d.ts +2 -2
package/dist/telemetry/index.js.map +1 -1
package/dist/wire/index.js +1 -1
package/docs/concepts.md +4 -4
package/docs/knowledge-readiness.md +2 -2
package/docs/wire-protocol.md +3 -3
package/package.json +13 -5
package/dist/chunk-CJJSB6ZQ.js.map +0 -1
package/examples/benchmarks/README.md +0 -44
package/examples/benchmarks/gsm8k/index.ts +0 -126
package/examples/benchmarks/swebench-lite/index.ts +0 -178
package/examples/multi-shot-optimization/index.ts +0 -114
package/examples/same-sandbox-harness/index.ts +0 -63

package/examples/same-sandbox-harness/index.ts DELETED Viewed

@@ -1,63 +0,0 @@
-import {
-  InMemoryTraceStore,
-  SandboxHarness,
-  SubprocessSandboxDriver,
-  TraceEmitter,
-} from '@tangle-network/agent-eval'
-/**
- * Same-sandbox pattern:
- * - one driver owns one workdir (the `workdir` argument is passed as its `cwd`)
- * - the harness runs setup/build/test there
- * - later checks can inspect files/logs/screenshots produced by those phases
- *
- * Replace `workdir` with a generated app, browser automation checkout, or
- * remote computer-use workspace. Resolves to `{ result, judged, traces }`.
- */
-export async function runSameSandboxExample(workdir: string) {
-  const store = new InMemoryTraceStore() // handed to the emitter; read back with listRuns() at the end
-  const driver = new SubprocessSandboxDriver({ cwd: workdir }) // the single driver, bound to the caller's workdir
-  const harness = new SandboxHarness(driver)
-  const emitter = new TraceEmitter(store)
-  await emitter.startRun({
-    scenarioId: 'same-sandbox-example',
-    layer: 'app-build',
-  })
-  const result = await harness.run({ // all three commands are given to the one harness/driver pair
-    setupCommand: 'pnpm install --frozen-lockfile',
-    runCommand: 'pnpm build',
-    testCommand: 'pnpm test',
-    timeoutMs: 180_000, // presumably milliseconds (3 min) given the name; per-command vs overall is not visible here — confirm
-  }, emitter)
-  const summary = [ // newline-joined evidence text; also passed below as the judge's `evidence`
-    `passed=${result.passed}`,
-    `score=${result.score}`,
-    `build=${result.run?.exitCode ?? 'not-run'}`, // 'not-run' when result.run or its exitCode is null/undefined
-    `test=${result.test?.exitCode ?? 'not-run'}`, // same fallback for the test phase
-    result.test?.stdout?.slice(-2000) ?? '', // tail of the test stdout (last 2000 units), or '' when absent
-  ].join('\n')
-  const judged = {
-    score: result.passed && summary.includes('test=0') ? 1 : 0, // NOTE(review): substring match can also hit 'test=0' inside the stdout tail; comparing result.test?.exitCode with 0 would be stricter
-    rationale: result.passed // rationale depends only on result.passed, so it can disagree with `score` above
-      ? 'Shared sandbox produced passing build/test evidence.'
-      : 'Shared sandbox did not produce passing build/test evidence.',
-  }
-  await emitter.recordJudge({
-    judgeId: 'same-sandbox-evidence',
-    name: 'same-sandbox-evidence',
-    dimension: 'evidence',
-    score: judged.score,
-    rationale: judged.rationale,
-    evidence: summary,
-  })
-  await emitter.endRun({ // NOTE(review): not reached if harness.run or recordJudge rejects — there is no try/finally here
-    pass: result.passed,
-    score: result.score, // the harness score, not judged.score
-    notes: judged.rationale,
-  })
-  return { result, judged, traces: await store.listRuns() } // traces come from the same store the emitter was built with
-}