@theokit/sdk 2.4.0 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +22 -0
- package/README.md +4 -0
- package/dist/eval.cjs +192 -16
- package/dist/eval.cjs.map +1 -1
- package/dist/eval.d.cts +2 -0
- package/dist/eval.d.ts +2 -0
- package/dist/eval.js +191 -18
- package/dist/eval.js.map +1 -1
- package/dist/index.cjs +31 -8
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +40 -5
- package/dist/index.d.ts +40 -5
- package/dist/index.js +31 -9
- package/dist/index.js.map +1 -1
- package/dist/internal/eval/code-runner.d.ts +28 -0
- package/dist/internal/persistence/index.cjs +68 -0
- package/dist/internal/persistence/index.cjs.map +1 -1
- package/dist/internal/persistence/index.d.cts +1 -0
- package/dist/internal/persistence/index.d.ts +1 -0
- package/dist/internal/persistence/index.js +65 -1
- package/dist/internal/persistence/index.js.map +1 -1
- package/dist/internal/persistence/jsonl.d.cts +34 -0
- package/dist/internal/persistence/jsonl.d.ts +34 -0
- package/dist/permission-engine.d.ts +12 -4
- package/dist/persistence.cjs +318 -0
- package/dist/persistence.cjs.map +1 -0
- package/dist/persistence.d.cts +24 -0
- package/dist/persistence.d.ts +24 -0
- package/dist/persistence.js +306 -0
- package/dist/persistence.js.map +1 -0
- package/dist/sandbox/index.cjs +71 -1
- package/dist/sandbox/index.cjs.map +1 -1
- package/dist/sandbox/index.d.cts +1 -0
- package/dist/sandbox/index.d.ts +1 -0
- package/dist/sandbox/index.js +70 -2
- package/dist/sandbox/index.js.map +1 -1
- package/dist/sandbox/provision.d.cts +53 -0
- package/dist/sandbox/provision.d.ts +53 -0
- package/dist/sandbox/shell-escape.d.cts +8 -0
- package/dist/sandbox/shell-escape.d.ts +8 -0
- package/dist/scorers.d.ts +19 -1
- package/dist/types/eval.d.ts +71 -0
- package/package.json +11 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,27 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 2.6.0
|
|
4
|
+
|
|
5
|
+
### Minor Changes
|
|
6
|
+
|
|
7
|
+
- edbc3c2: Add the public `@theokit/sdk/persistence` sub-path (V2-3 — Theo Harness Capability Map). Promotes the consumer-grade persistence helpers from the semver-exempt `internal/persistence` to a STABLE, semver-protected surface: `appendJsonl` / `readJsonlIds` / `loadJsonl` (durable JSONL persist + resume), `replaceFileAtomic` / `atomicWriteText` / `atomicWriteJson` (audited atomic write — fsync, 0o600, crypto-random temp), `withFileLock` (cross-process lock), and `openSqliteResilient` / `applyWalWithFallback` / `isCorruptionError` (resilient SQLite bootstrap). Several were extracted from a real consumer (the SWE-bench eval harness); this sub-path lets consumers adopt them without coupling to `internal/`.
|
|
8
|
+
|
|
9
|
+
## 2.5.0
|
|
10
|
+
|
|
11
|
+
### Minor Changes
|
|
12
|
+
|
|
13
|
+
- 301d4a3: Eval harness (M6, Tema E): first-party SWE-bench-style primitives over the existing `Eval`/`Scorers`/`SandboxBackend` surface, with zero new runtime dependencies.
|
|
14
|
+
|
|
15
|
+
- `loadJsonl(path, { map? })` from `@theokit/sdk/eval` — generic JSONL dataset loader with line-numbered `JsonlParseError`; the dataset schema is the caller's via `map`.
|
|
16
|
+
- Durable batch: `Eval.run({ persist: { path, key, resume }, classify })` flushes each row the instant it completes and resumes a crashed run by skipping already-persisted rows.
|
|
17
|
+
- `provisionRepo(sandbox, { repoUrl, ref, instanceId })` + `RepoProvisionError` — portable git clone+checkout over `SandboxBackend.execute`.
|
|
18
|
+
- `Scorers.verifyGate({ failToPass, passToPass })` — grades a patch by test exit-code via the sandbox; `EvalRowResult.artifact` carries `{ diff, applies }`.
|
|
19
|
+
|
|
20
|
+
- 32180fe: M7 (Tema F) SDK slice — PermissionEngine default-deny + plugin wiring.
|
|
21
|
+
|
|
22
|
+
- `PermissionEngine` now takes `{ defaultAction }` (default `"allow"`, backward-compatible) — opt into default-deny with `new PermissionEngine(rules, { defaultAction: "deny" })`. `PermissionAction`/`PermissionRule`/`PermissionEngineOptions` types are now exported.
|
|
23
|
+
- New `createPermissionPlugin(engine, opts?)` wires a `PermissionEngine` into the `definePlugin` `pre_tool_call` veto (the engine was previously exported-but-unwired): `deny` blocks, `ask` defers to `opts.onAsk` (fail-closed block by default), `allow` passes.
|
|
24
|
+
|
|
3
25
|
## 2.4.0
|
|
4
26
|
|
|
5
27
|
### Minor Changes
|
package/README.md
CHANGED
|
@@ -26,6 +26,10 @@
|
|
|
26
26
|
|
|
27
27
|
For the full reference, see the [root README](../../README.md) and [`docs.md`](../../docs.md).
|
|
28
28
|
|
|
29
|
+
## Capability map
|
|
30
|
+
|
|
31
|
+
New here? The [**Theo Harness Capability Map**](../../docs/harness-capability-map.md) is the discovery front-door — every harness primitive with its import path, signature, and a one-line example (find `compactTranscript`, `buildRepoMap`, `isTransientError`, `@theokit/sdk/persistence`, ... without reading source). The exhaustive contract is [`docs.md`](../../docs.md).
|
|
32
|
+
|
|
29
33
|
## Install
|
|
30
34
|
|
|
31
35
|
```bash
|
package/dist/eval.cjs
CHANGED
|
@@ -15538,6 +15538,69 @@ setAgentFacade({
|
|
|
15538
15538
|
resume: (agentId, options) => Agent.resume(agentId, options),
|
|
15539
15539
|
batch: (prompts, options) => Agent.batch(prompts, options)
|
|
15540
15540
|
});
|
|
15541
|
+
/**
 * Error type for a JSONL line that could not be accepted.
 *
 * Carries the number of the offending line in `line` (the value passed by
 * the thrower) and reports its `name` as "JsonlParseError".
 */
var JsonlParseError = class extends Error {
  /** Line number supplied to the constructor. */
  line;

  /**
   * @param {string} message - Human-readable description of the failure.
   * @param {number} line - Line number the failure refers to.
   */
  constructor(message, line) {
    super(message);
    this.name = "JsonlParseError";
    this.line = line;
  }
};
|
|
15549
|
+
/**
 * Report whether `value` is a non-null, non-array object.
 *
 * Note that this accepts any object that is not an array (class instances
 * included); it does not inspect the prototype.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} `true` for non-null objects that are not arrays.
 */
function isPlainObject(value) {
  if (value === null || typeof value !== "object") return false;
  return !Array.isArray(value);
}
|
|
15552
|
+
/**
 * Leniently parse one JSONL line.
 *
 * @param {string} line - A single line of text (callers pass it trimmed).
 * @returns {object | undefined} The parsed value when the line is valid JSON
 *   whose top level is a non-array object; `undefined` for an empty line,
 *   invalid JSON, or any other JSON value. Never throws on bad JSON.
 */
function tryParseObjectLine(line) {
  if (line.length === 0) return undefined;
  try {
    const candidate = JSON.parse(line);
    return isPlainObject(candidate) ? candidate : undefined;
  } catch {
    // Malformed lines are skipped rather than reported.
    return undefined;
  }
}
|
|
15562
|
+
/**
 * Synchronously load a JSONL file into an array, one element per record.
 *
 * The file is read as UTF-8 and split on "\n"; each line is trimmed and
 * blank lines are skipped (they still count toward line numbers). Every
 * remaining line must be valid JSON whose top level is a non-array object.
 *
 * @param {string} path - File to read.
 * @param {{ map?: (record: object, lineNumber: number) => unknown }} [opts]
 *   When `map` is given, its return value is stored instead of the parsed
 *   record; `lineNumber` is 1-based.
 * @returns {unknown[]} Parsed (or mapped) records in file order.
 * @throws {JsonlParseError} On the first line that is invalid JSON or is not
 *   a JSON object; `line` holds the 1-based line number. For invalid JSON the
 *   original parser error is available as `cause`.
 * @throws Whatever `fs.readFileSync` throws when the file cannot be read.
 */
function loadJsonl(path, opts = {}) {
  const text = fs.readFileSync(path, "utf8");
  const out = [];
  let lineNumber = 0;
  for (const rawLine of text.split("\n")) {
    lineNumber += 1;
    const line = rawLine.trim();
    if (line.length === 0) continue;
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch (cause) {
      const error = new JsonlParseError(`line ${lineNumber}: invalid JSON`, lineNumber);
      // Keep the parser's own diagnostic reachable without changing the
      // message; non-enumerable, like the built-in `cause` option installs it.
      Object.defineProperty(error, "cause", {
        value: cause,
        writable: true,
        enumerable: false,
        configurable: true
      });
      throw error;
    }
    if (!isPlainObject(parsed)) {
      throw new JsonlParseError(`line ${lineNumber}: not a JSON object`, lineNumber);
    }
    out.push(opts.map ? opts.map(parsed, lineNumber) : parsed);
  }
  return out;
}
|
|
15583
|
+
/**
 * Append one record to a JSONL file as a single "\n"-terminated line,
 * creating the parent directory (recursively) if needed.
 *
 * The record is serialized before any filesystem call, so a record that
 * cannot be serialized leaves the directory and file untouched.
 *
 * @param {string} path$1 - Destination file.
 * @param {unknown} record - Value to serialize with `JSON.stringify`.
 * @throws {TypeError} When `JSON.stringify(record)` yields `undefined`
 *   (e.g. `undefined`, a function, a symbol) - writing that would put a
 *   non-JSON `undefined` line in the file - or when `JSON.stringify` itself
 *   throws (cycles, BigInt).
 * @throws Whatever `fs.mkdirSync` / `fs.appendFileSync` throw on I/O failure.
 */
function appendJsonl(path$1, record) {
  const serialized = JSON.stringify(record);
  if (serialized === undefined) {
    throw new TypeError("appendJsonl: record is not JSON-serializable");
  }
  fs.mkdirSync(path.dirname(path$1), { recursive: true });
  fs.appendFileSync(path$1, `${serialized}\n`);
}
|
|
15588
|
+
/**
 * Collect the set of record keys already present in a JSONL file.
 *
 * Reading is lenient: if the file cannot be read for any reason an empty set
 * is returned, and lines that `tryParseObjectLine` rejects are skipped.
 *
 * @param {string} path - JSONL file to scan.
 * @param {(record: object) => unknown} keyFn - Derives a key from each parsed
 *   record; only non-empty string results are kept.
 * @returns {Set<string>} Keys found in the file.
 */
function readJsonlIds(path, keyFn) {
  const seen = new Set();
  let contents;
  try {
    contents = fs.readFileSync(path, "utf8");
  } catch {
    // Unreadable file is treated the same as "nothing recorded yet".
    return seen;
  }
  for (const rawLine of contents.split("\n")) {
    const record = tryParseObjectLine(rawLine.trim());
    if (record === undefined) continue;
    const id = keyFn(record);
    if (typeof id !== "string" || id.length === 0) continue;
    seen.add(id);
  }
  return seen;
}
|
|
15541
15604
|
|
|
15542
15605
|
// src/internal/eval/runner.ts
|
|
15543
15606
|
init_agent_factory_registry();
|
|
@@ -15729,6 +15792,50 @@ function normalizeScorers(input) {
|
|
|
15729
15792
|
return { name: s.name, score: s.score };
|
|
15730
15793
|
});
|
|
15731
15794
|
}
|
|
15795
|
+
/**
 * Build a placeholder result row for a dataset entry that has not been run.
 *
 * The row has an empty output, no scores, and zeroed `meanScore` and
 * `durationMs`; `expected` and `metadata` are copied only when the entry
 * defines them.
 *
 * @param {{ input: unknown, expected?: unknown, metadata?: unknown }} entry
 * @param {number} index - Position of the entry in the dataset.
 * @returns {object} The placeholder row.
 */
function probeRow(entry, index) {
  // Properties are added in the same order the row shape lists them.
  const row = { index, input: entry.input, output: "" };
  if (entry.expected !== undefined) row.expected = entry.expected;
  row.scores = [];
  row.meanScore = 0;
  row.durationMs = 0;
  if (entry.metadata !== undefined) row.metadata = entry.metadata;
  return row;
}
|
|
15807
|
+
/**
 * Determine which row keys a resumed run can skip.
 *
 * @param {{ path: string, key: (row: object) => unknown, resume?: boolean }} persist
 * @returns {Set<string>} Empty unless `persist.resume` is exactly `true`;
 *   otherwise the keys read from `persist.path`, ignoring persisted rows
 *   that carry an `error` property (so those rows are not treated as done).
 */
function computeDoneKeys(persist) {
  if (persist.resume === true) {
    return readJsonlIds(persist.path, (record) => {
      if (record.error !== undefined) return undefined;
      return persist.key(record);
    });
  }
  return new Set();
}
|
|
15814
|
+
/**
 * Persist a row via `appendJsonl` without letting a write failure propagate.
 *
 * Any thrown value is reported through `console.warn` (the message for
 * `Error` instances, the raw value otherwise) and then dropped.
 *
 * @param {string} path - JSONL file to append to.
 * @param {object} row - Row to persist.
 * @returns {void}
 */
function appendRowSafely(path, row) {
  try {
    appendJsonl(path, row);
  } catch (failure) {
    const detail = failure instanceof Error ? failure.message : failure;
    console.warn("[eval] persist append failed (ignored):", detail);
  }
}
|
|
15824
|
+
/**
 * Create the per-run sink that decides which entries to skip and
 * post-processes each completed row.
 *
 * @param {{ path: string, key: (row: object) => unknown, resume?: boolean } | undefined} persist
 *   Persistence settings; when `undefined`, nothing is skipped or written.
 * @param {((row: object) => unknown) | undefined} classify
 *   Optional classifier whose non-`undefined` result is stored as `outcome`.
 * @returns {{ isResumed(entry: object, index: number): boolean, finalize(row: object): object }}
 */
function makeRowSink(persist, classify) {
  const persistEnabled = persist !== undefined;
  // Computed once up front; rows finalized during this run are not added.
  const doneKeys = persistEnabled ? computeDoneKeys(persist) : new Set();

  const isResumed = (entry, index) => {
    if (!persistEnabled || doneKeys.size === 0) return false;
    // The key is derived from a placeholder row since no output exists yet.
    const probeKey = persist.key(probeRow(entry, index));
    return doneKeys.has(probeKey);
  };

  const finalize = (row) => {
    const outcome = classify?.(row);
    const finalRow = outcome === undefined ? row : { ...row, outcome };
    if (persistEnabled) {
      appendRowSafely(persist.path, finalRow);
    }
    return finalRow;
  };

  return { isResumed, finalize };
}
|
|
15732
15839
|
async function applyScorer(scorer, output, expected) {
|
|
15733
15840
|
let raw;
|
|
15734
15841
|
try {
|
|
@@ -15774,7 +15881,15 @@ function makeAgentForBatch(spec, _entries) {
|
|
|
15774
15881
|
}
|
|
15775
15882
|
return spec;
|
|
15776
15883
|
}
|
|
15777
|
-
async function
|
|
15884
|
+
/**
 * Process the dataset entry at one index: run it, finalize the row through
 * the sink, store it in `rows[idx]` and notify `onRow`.
 *
 * Does nothing when there is no entry at `idx` or when the sink reports the
 * entry as already completed by an earlier run.
 *
 * @param {number} idx - Index of the entry to process.
 * @param {object[]} entries - Dataset entries.
 * @param {unknown} spec - Agent spec forwarded to `runOneEntry`.
 * @param {unknown[]} scorers - Scorers forwarded to `runOneEntry`.
 * @param {{ isResumed: Function, finalize: Function }} sink - Row sink.
 * @param {object[]} rows - Output array, written at position `idx`.
 * @param {(row: object, index: number) => void} onRow - Completion callback.
 * @returns {Promise<void>}
 */
async function runManualSlot(idx, entries, spec, scorers, sink, rows, onRow) {
  const entry = entries[idx];
  const shouldSkip = entry === undefined || sink.isResumed(entry, idx);
  if (shouldSkip) return;
  const rawRow = await runOneEntry(spec, entry, idx, scorers);
  const row = sink.finalize(rawRow);
  rows[idx] = row;
  onRow(row, idx);
}
|
|
15892
|
+
async function runRowsManually(entries, spec, scorers, concurrency, signal, onRow, sink) {
|
|
15778
15893
|
const rows = new Array(entries.length);
|
|
15779
15894
|
const state2 = { cursor: 0 };
|
|
15780
15895
|
const worker = async () => {
|
|
@@ -15782,11 +15897,7 @@ async function runRowsManually(entries, spec, scorers, concurrency, signal, onRo
|
|
|
15782
15897
|
if (signal?.aborted === true) return;
|
|
15783
15898
|
const idx = state2.cursor;
|
|
15784
15899
|
state2.cursor += 1;
|
|
15785
|
-
|
|
15786
|
-
if (entry === void 0) continue;
|
|
15787
|
-
const row = await runOneEntry(spec, entry, idx, scorers);
|
|
15788
|
-
rows[idx] = row;
|
|
15789
|
-
onRow(row, idx);
|
|
15900
|
+
await runManualSlot(idx, entries, spec, scorers, sink, rows, onRow);
|
|
15790
15901
|
}
|
|
15791
15902
|
};
|
|
15792
15903
|
const workers = Array.from({ length: Math.min(concurrency, entries.length) }, () => worker());
|
|
@@ -15836,23 +15947,32 @@ async function scoreBatchOutput(br, expected, scorers) {
|
|
|
15836
15947
|
}
|
|
15837
15948
|
return scoreEntries;
|
|
15838
15949
|
}
|
|
15839
|
-
async function runRowsViaBatch(entries, agentOptions, scorers, concurrency, signal, onRow) {
|
|
15840
|
-
const
|
|
15950
|
+
async function runRowsViaBatch(entries, agentOptions, scorers, concurrency, signal, onRow, sink) {
|
|
15951
|
+
const pending = [];
|
|
15952
|
+
for (let i = 0; i < entries.length; i += 1) {
|
|
15953
|
+
const entry = entries[i];
|
|
15954
|
+
if (entry === void 0) continue;
|
|
15955
|
+
if (sink.isResumed(entry, i)) continue;
|
|
15956
|
+
pending.push({ entry, index: i });
|
|
15957
|
+
}
|
|
15841
15958
|
const batchOpts = {
|
|
15842
15959
|
...agentOptions,
|
|
15843
15960
|
concurrency,
|
|
15844
15961
|
...signal !== void 0 ? { signal } : {}
|
|
15845
15962
|
};
|
|
15846
|
-
const batchResults = await getAgentFacade().batch(
|
|
15963
|
+
const batchResults = await getAgentFacade().batch(
|
|
15964
|
+
pending.map((p) => p.entry.input),
|
|
15965
|
+
batchOpts
|
|
15966
|
+
);
|
|
15847
15967
|
const rows = [];
|
|
15848
15968
|
for (let i = 0; i < batchResults.length; i += 1) {
|
|
15849
|
-
const
|
|
15969
|
+
const slot = pending[i];
|
|
15850
15970
|
const br = batchResults[i];
|
|
15851
|
-
if (
|
|
15852
|
-
const scoreEntries = await scoreBatchOutput(br, entry.expected, scorers);
|
|
15853
|
-
const row = rowFromBatchResult(entry, br, scoreEntries,
|
|
15971
|
+
if (slot === void 0 || br === void 0) continue;
|
|
15972
|
+
const scoreEntries = await scoreBatchOutput(br, slot.entry.expected, scorers);
|
|
15973
|
+
const row = sink.finalize(rowFromBatchResult(slot.entry, br, scoreEntries, slot.index));
|
|
15854
15974
|
rows.push(row);
|
|
15855
|
-
onRow(row,
|
|
15975
|
+
onRow(row, slot.index);
|
|
15856
15976
|
}
|
|
15857
15977
|
return rows;
|
|
15858
15978
|
}
|
|
@@ -15877,6 +15997,7 @@ async function runEval(options, runOpts) {
|
|
|
15877
15997
|
const onRow = (row, i) => {
|
|
15878
15998
|
safeHook(() => hooks?.afterRow?.(row, i));
|
|
15879
15999
|
};
|
|
16000
|
+
const sink = makeRowSink(runOpts?.persist, runOpts?.classify);
|
|
15880
16001
|
let rows;
|
|
15881
16002
|
if (isAgentInstance(options.agent) || typeof options.agent === "function") {
|
|
15882
16003
|
rows = await runRowsManually(
|
|
@@ -15885,11 +16006,12 @@ async function runEval(options, runOpts) {
|
|
|
15885
16006
|
scorers,
|
|
15886
16007
|
concurrency,
|
|
15887
16008
|
signal,
|
|
15888
|
-
onRow
|
|
16009
|
+
onRow,
|
|
16010
|
+
sink
|
|
15889
16011
|
);
|
|
15890
16012
|
} else {
|
|
15891
16013
|
const batchOpts = makeAgentForBatch(options.agent, indexed);
|
|
15892
|
-
rows = await runRowsViaBatch(indexed, batchOpts, scorers, concurrency, signal, onRow);
|
|
16014
|
+
rows = await runRowsViaBatch(indexed, batchOpts, scorers, concurrency, signal, onRow, sink);
|
|
15893
16015
|
}
|
|
15894
16016
|
const aggregate = computeAggregate(rows);
|
|
15895
16017
|
const endedAt = Date.now();
|
|
@@ -15915,6 +16037,25 @@ async function runEval(options, runOpts) {
|
|
|
15915
16037
|
}
|
|
15916
16038
|
}
|
|
15917
16039
|
|
|
16040
|
+
// src/sandbox/shell-escape.ts
|
|
16041
|
+
/**
 * Quote a string as a single POSIX-shell word.
 *
 * The value is wrapped in single quotes, and each embedded single quote is
 * rewritten as `'\''` (close quote, escaped quote, reopen quote).
 *
 * @param {string} arg - Raw argument text.
 * @returns {string} The single-quoted form.
 */
function shellEscapePosix(arg) {
  const escapedBody = arg.split("'").join("'\\''");
  return `'${escapedBody}'`;
}
|
|
16044
|
+
|
|
16045
|
+
// src/internal/eval/code-runner.ts
|
|
16046
|
+
/** File name, relative to the repo directory, used for the uploaded patch. */
var ARTIFACT_PATCH = ".theo-artifact.patch";

/**
 * Capture the repo's current `git diff` and check that it is a usable patch.
 *
 * Runs `git diff` in `repoDir` through the sandbox. A non-empty diff is
 * uploaded into the repo directory as `ARTIFACT_PATCH` and validated with
 * `git apply --check --reverse`.
 *
 * @param {{ execute: Function, uploadFile: Function }} sandbox - Backend used
 *   to run commands and upload the patch file.
 * @param {string} repoDir - Repo directory inside the sandbox (shell-escaped
 *   here before being placed on a command line).
 * @returns {Promise<{ diff: string, applies: boolean }>} An empty diff yields
 *   `{ diff: "", applies: false }`; otherwise `applies` is whether the check
 *   command exited with code 0.
 */
async function captureArtifact(sandbox, repoDir) {
  const quotedRepo = shellEscapePosix(repoDir);
  const { stdout: diff } = await sandbox.execute(`git -C ${quotedRepo} diff`);
  if (diff.length === 0) {
    return { diff: "", applies: false };
  }
  const patchPath = `${repoDir}/${ARTIFACT_PATCH}`;
  await sandbox.uploadFile(patchPath, diff);
  // The patch name is passed relative; `git -C` resolves it inside the repo.
  const quotedPatch = shellEscapePosix(ARTIFACT_PATCH);
  const { exitCode } = await sandbox.execute(
    `git -C ${quotedRepo} apply --check --reverse ${quotedPatch}`
  );
  return { diff, applies: exitCode === 0 };
}
|
|
16058
|
+
|
|
15918
16059
|
// src/internal/scorers/llm-judge.ts
|
|
15919
16060
|
init_agent_factory_registry();
|
|
15920
16061
|
function buildPrompt(subject, criteria, rubric, expected) {
|
|
@@ -16057,6 +16198,38 @@ var Scorers = {
|
|
|
16057
16198
|
}
|
|
16058
16199
|
};
|
|
16059
16200
|
},
|
|
16201
|
+
/**
|
|
16202
|
+
* Verify-gate scorer (M6-2): runs the project's tests in the provisioned
|
|
16203
|
+
* repo via `SandboxBackend.execute` and scores `1` iff the command exits `0`,
|
|
16204
|
+
* else `0` with the exit code + truncated stderr in `reason`. Grades the
|
|
16205
|
+
* artifact captured by `captureArtifact` (D2 — rides `execute`, never a
|
|
16206
|
+
* direct `child_process`).
|
|
16207
|
+
*
|
|
16208
|
+
* SECURITY: `command` is REQUIRED and the caller's builder owns shell-safety
|
|
16209
|
+
* of the (potentially untrusted, dataset-derived) test identifiers. There is
|
|
16210
|
+
* NO default that runs bare test names — that would interpolate untrusted
|
|
16211
|
+
* `failToPass`/`passToPass` straight into a shell. `repoDir` is shell-escaped
|
|
16212
|
+
* by the SDK; the test list is the builder's responsibility to render safely.
|
|
16213
|
+
*
|
|
16214
|
+
* PORTABILITY: the command is wrapped as `cd <repoDir> && <cmd>`, which
|
|
16215
|
+
* assumes a shell-backed `SandboxBackend` (LocalSandbox/Docker). A backend
|
|
16216
|
+
* that rejects shell metacharacters in `execute` is unsupported for this scorer.
|
|
16217
|
+
*/
|
|
16218
|
+
verifyGate(opts) {
|
|
16219
|
+
const { sandbox, repoDir, failToPass, passToPass, command } = opts;
|
|
16220
|
+
return {
|
|
16221
|
+
name: "verify-gate",
|
|
16222
|
+
score: async () => {
|
|
16223
|
+
const cmd = command([...failToPass, ...passToPass]).trim();
|
|
16224
|
+
if (cmd.length === 0) {
|
|
16225
|
+
return { score: 0, reason: "verify_gate_empty_command" };
|
|
16226
|
+
}
|
|
16227
|
+
const r = await sandbox.execute(`cd ${shellEscapePosix(repoDir)} && ${cmd}`);
|
|
16228
|
+
if (r.exitCode === 0) return { score: 1 };
|
|
16229
|
+
return { score: 0, reason: `exit=${r.exitCode} ${r.stderr.slice(0, 200)}`.trim() };
|
|
16230
|
+
}
|
|
16231
|
+
};
|
|
16232
|
+
},
|
|
16060
16233
|
jsonShape(schema, opts = {}) {
|
|
16061
16234
|
return {
|
|
16062
16235
|
name: "json-shape",
|
|
@@ -16131,6 +16304,9 @@ var Eval = class _Eval {
|
|
|
16131
16304
|
|
|
16132
16305
|
exports.Eval = Eval;
|
|
16133
16306
|
exports.EvalAlreadyRunningError = EvalAlreadyRunningError;
|
|
16307
|
+
exports.JsonlParseError = JsonlParseError;
|
|
16134
16308
|
exports.Scorers = Scorers;
|
|
16309
|
+
exports.captureArtifact = captureArtifact;
|
|
16310
|
+
exports.loadJsonl = loadJsonl;
|
|
16135
16311
|
//# sourceMappingURL=eval.cjs.map
|
|
16136
16312
|
//# sourceMappingURL=eval.cjs.map
|