@tangle-network/agent-eval 0.20.8 → 0.20.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,63 +0,0 @@
1
- import {
2
- InMemoryTraceStore,
3
- SandboxHarness,
4
- SubprocessSandboxDriver,
5
- TraceEmitter,
6
- } from '@tangle-network/agent-eval'
7
-
8
- /**
9
- * Same-sandbox pattern:
10
- * - one driver owns one workdir
11
- * - the harness runs setup/build/test there
12
- * - later checks can inspect files/logs/screenshots produced by those phases
13
- *
14
- * Replace `workdir` with a generated app, browser automation checkout, or
15
- * remote computer-use workspace.
16
- */
17
- export async function runSameSandboxExample(workdir: string) {
18
- const store = new InMemoryTraceStore()
19
- const driver = new SubprocessSandboxDriver({ cwd: workdir })
20
- const harness = new SandboxHarness(driver)
21
- const emitter = new TraceEmitter(store)
22
- await emitter.startRun({
23
- scenarioId: 'same-sandbox-example',
24
- layer: 'app-build',
25
- })
26
-
27
- const result = await harness.run({
28
- setupCommand: 'pnpm install --frozen-lockfile',
29
- runCommand: 'pnpm build',
30
- testCommand: 'pnpm test',
31
- timeoutMs: 180_000,
32
- }, emitter)
33
-
34
- const summary = [
35
- `passed=${result.passed}`,
36
- `score=${result.score}`,
37
- `build=${result.run?.exitCode ?? 'not-run'}`,
38
- `test=${result.test?.exitCode ?? 'not-run'}`,
39
- result.test?.stdout?.slice(-2000) ?? '',
40
- ].join('\n')
41
-
42
- const judged = {
43
- score: result.passed && summary.includes('test=0') ? 1 : 0,
44
- rationale: result.passed
45
- ? 'Shared sandbox produced passing build/test evidence.'
46
- : 'Shared sandbox did not produce passing build/test evidence.',
47
- }
48
- await emitter.recordJudge({
49
- judgeId: 'same-sandbox-evidence',
50
- name: 'same-sandbox-evidence',
51
- dimension: 'evidence',
52
- score: judged.score,
53
- rationale: judged.rationale,
54
- evidence: summary,
55
- })
56
- await emitter.endRun({
57
- pass: result.passed,
58
- score: result.score,
59
- notes: judged.rationale,
60
- })
61
-
62
- return { result, judged, traces: await store.listRuns() }
63
- }