@theokit/sdk 2.4.0 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,21 @@
1
1
  # Changelog
2
2
 
3
+ ## 2.5.0
4
+
5
+ ### Minor Changes
6
+
7
+ - 301d4a3: Eval harness (M6, Tema E): first-party SWE-bench-style primitives over the existing `Eval`/`Scorers`/`SandboxBackend` surface, with zero new runtime dependencies.
8
+
9
+ - `loadJsonl(path, { map? })` from `@theokit/sdk/eval` — generic JSONL dataset loader with line-numbered `JsonlParseError`; the dataset schema is the caller's via `map`.
10
+ - Durable batch: `Eval.run({ persist: { path, key, resume }, classify })` flushes each row the instant it completes and resumes a crashed run by skipping already-persisted rows.
11
+ - `provisionRepo(sandbox, { repoUrl, ref, instanceId })` + `RepoProvisionError` — portable git clone+checkout over `SandboxBackend.execute`.
12
+ - `Scorers.verifyGate({ failToPass, passToPass })` — grades a patch by test exit-code via the sandbox; `EvalRowResult.artifact` carries `{ diff, applies }`.
13
+
14
+ - 32180fe: M7 (Tema F) SDK slice — PermissionEngine default-deny + plugin wiring.
15
+
16
+ - `PermissionEngine` now takes `{ defaultAction }` (default `"allow"`, backward-compatible) — opt into default-deny with `new PermissionEngine(rules, { defaultAction: "deny" })`. `PermissionAction`/`PermissionRule`/`PermissionEngineOptions` types are now exported.
17
+ - New `createPermissionPlugin(engine, opts?)` wires a `PermissionEngine` into the `definePlugin` `pre_tool_call` veto (the engine was previously exported-but-unwired): `deny` blocks, `ask` defers to `opts.onAsk` (fail-closed block by default), `allow` passes.
18
+
3
19
  ## 2.4.0
4
20
 
5
21
  ### Minor Changes
package/dist/eval.cjs CHANGED
@@ -15538,6 +15538,69 @@ setAgentFacade({
15538
15538
  resume: (agentId, options) => Agent.resume(agentId, options),
15539
15539
  batch: (prompts, options) => Agent.batch(prompts, options)
15540
15540
  });
15541
/**
 * Error describing a JSONL line that could not be accepted as a record.
 *
 * Besides the usual `message`, instances expose the offending line number as
 * `line` and report `name === "JsonlParseError"`.
 */
var JsonlParseError = class extends Error {
  /** Line number supplied by the thrower. */
  line;

  /**
   * @param {string} message - Human-readable description of the failure.
   * @param {number} line - Number of the line that failed.
   * @param {{ cause?: unknown }} [options] - Optional standard `Error`
   *   options; pass `{ cause }` to keep the underlying error reachable.
   *   Omitting it behaves exactly like the two-argument form.
   */
  constructor(message, line, options) {
    super(message, options);
    this.line = line;
    this.name = "JsonlParseError";
  }
};
15549
/**
 * Tells whether `value` is a non-null, non-array object.
 *
 * @param {unknown} value - Anything.
 * @returns {boolean} `true` for objects other than `null` and arrays.
 */
function isPlainObject(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
15552
/**
 * Lenient single-line parser: yields the decoded object, or `undefined` when
 * the line is empty, is not valid JSON, or decodes to something other than a
 * non-array object. Never throws for string input.
 *
 * @param {string} line - One (already trimmed) line of text.
 * @returns {object | undefined} The decoded object, if any.
 */
function tryParseObjectLine(line) {
  if (line.length === 0) {
    return undefined;
  }
  let candidate;
  try {
    candidate = JSON.parse(line);
  } catch {
    // Malformed lines are skipped rather than reported.
    return undefined;
  }
  if (!isPlainObject(candidate)) {
    return undefined;
  }
  return candidate;
}
15562
/**
 * Synchronously reads a JSONL file and returns one entry per non-blank line.
 *
 * Lines are split on `\n` and trimmed, so blank lines and trailing `\r` are
 * tolerated. Line numbers are 1-based and count blank lines too.
 *
 * @param {string} path - File to read (UTF-8).
 * @param {{ map?: (record: object, lineNumber: number) => unknown } | null} [opts]
 *   Optional settings; `map` converts each decoded object into the caller's
 *   row shape. A `null`/omitted `opts` means "no mapping".
 * @returns {unknown[]} Decoded (and optionally mapped) records in file order.
 * @throws {JsonlParseError} When a line is not valid JSON (the parser's own
 *   error is attached as `cause`) or decodes to a non-object value.
 */
function loadJsonl(path, opts = {}) {
  const text = fs.readFileSync(path, "utf8");
  const out = [];
  let lineNumber = 0;
  for (const rawLine of text.split("\n")) {
    lineNumber += 1;
    const line = rawLine.trim();
    if (line.length === 0) continue;
    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch (cause) {
      const error = new JsonlParseError(`line ${lineNumber}: invalid JSON`, lineNumber);
      // Keep the parser's diagnostic reachable without changing the message.
      // Defined as a non-enumerable property, mirroring the standard `cause`.
      Object.defineProperty(error, "cause", {
        value: cause,
        writable: true,
        configurable: true,
        enumerable: false
      });
      throw error;
    }
    if (!isPlainObject(parsed)) {
      throw new JsonlParseError(`line ${lineNumber}: not a JSON object`, lineNumber);
    }
    // `opts?.` guards an explicit `null`, which the `= {}` default does not cover.
    out.push(opts?.map ? opts.map(parsed, lineNumber) : parsed);
  }
  return out;
}
15583
/**
 * Appends `record` to a JSONL file as a single JSON line, creating any
 * missing parent directories first.
 *
 * @param {string} path$1 - Destination file.
 * @param {unknown} record - Value serialised with `JSON.stringify`.
 */
function appendJsonl(path$1, record) {
  const parentDir = path.dirname(path$1);
  fs.mkdirSync(parentDir, { recursive: true });
  // One record per line: the serialised value followed by a newline.
  const payload = `${JSON.stringify(record)}\n`;
  fs.appendFileSync(path$1, payload);
}
15588
/**
 * Collects the keys of records already present in a JSONL file.
 *
 * The read is best-effort: if the file cannot be read for any reason an empty
 * set is returned, and lines that are blank, malformed or not JSON objects
 * are skipped silently.
 *
 * @param {string} path - JSONL file to scan.
 * @param {(record: object) => unknown} keyFn - Derives a key from a record;
 *   only non-empty string results are kept.
 * @returns {Set<string>} Distinct keys found in the file.
 */
function readJsonlIds(path, keyFn) {
  const seen = /* @__PURE__ */ new Set();
  let contents;
  try {
    contents = fs.readFileSync(path, "utf8");
  } catch {
    // Unreadable file: treated as "nothing recorded yet".
    return seen;
  }
  for (const segment of contents.split("\n")) {
    const record = tryParseObjectLine(segment.trim());
    if (record !== undefined) {
      const id = keyFn(record);
      if (typeof id === "string" && id.length > 0) {
        seen.add(id);
      }
    }
  }
  return seen;
}
15541
15604
 
15542
15605
  // src/internal/eval/runner.ts
15543
15606
  init_agent_factory_registry();
@@ -15729,6 +15792,50 @@ function normalizeScorers(input) {
15729
15792
  return { name: s.name, score: s.score };
15730
15793
  });
15731
15794
  }
15795
/**
 * Builds a placeholder result row for a dataset entry that has not been run:
 * empty output, no scores, zero mean score and zero duration. `expected` and
 * `metadata` are copied only when the entry defines them.
 *
 * @param {{ input: unknown, expected?: unknown, metadata?: unknown }} entry
 * @param {number} index - Position of the entry in the dataset.
 * @returns {object} The placeholder row.
 */
function probeRow(entry, index) {
  // Properties are added in a fixed order so serialised rows keep their shape.
  const row = { index, input: entry.input, output: "" };
  if (entry.expected !== undefined) {
    row.expected = entry.expected;
  }
  row.scores = [];
  row.meanScore = 0;
  row.durationMs = 0;
  if (entry.metadata !== undefined) {
    row.metadata = entry.metadata;
  }
  return row;
}
15807
/**
 * Determines which row keys can be skipped when resuming a persisted run.
 *
 * Unless `persist.resume` is exactly `true`, nothing is skipped. Otherwise
 * the persisted file is scanned and only rows without an `error` field
 * contribute a key, so errored rows are not treated as done.
 *
 * @param {{ path: string, key: (row: object) => unknown, resume?: boolean }} persist
 * @returns {Set<string>} Keys of rows considered done.
 */
function computeDoneKeys(persist) {
  if (persist.resume === true) {
    const keyOfCleanRow = (row) => {
      if (row.error !== undefined) {
        return undefined;
      }
      return persist.key(row);
    };
    return readJsonlIds(persist.path, keyOfCleanRow);
  }
  return /* @__PURE__ */ new Set();
}
15814
/**
 * Persists one row, never letting a write failure escape: any error raised by
 * the append is reported through `console.warn` and then dropped.
 *
 * @param {string} path - JSONL file receiving the row.
 * @param {unknown} row - Row to append.
 */
function appendRowSafely(path, row) {
  try {
    appendJsonl(path, row);
  } catch (failure) {
    // Show the message for real errors, the raw value for anything else.
    const detail = failure instanceof Error ? failure.message : failure;
    console.warn("[eval] persist append failed (ignored):", detail);
  }
}
15824
/**
 * Creates the per-run row sink used for resume checks and row finalisation.
 *
 * @param {{ path: string, key: (row: object) => unknown, resume?: boolean } | undefined} persist
 *   Persistence settings; `undefined` disables both resume and persistence.
 * @param {((row: object) => unknown) | undefined} classify - Optional hook
 *   whose non-`undefined` result is stored on the row as `outcome`.
 * @returns {{ isResumed: Function, finalize: Function }}
 *   `isResumed(entry, index)` reports whether the entry's key is already
 *   recorded as done; `finalize(row)` applies `classify`, appends the row
 *   when persistence is on, and returns the final row.
 */
function makeRowSink(persist, classify) {
  const persisting = persist !== undefined;
  // Computed once, up front; not refreshed while the run progresses.
  const doneKeys = persisting ? computeDoneKeys(persist) : /* @__PURE__ */ new Set();

  const isResumed = (entry, index) => {
    if (!persisting || doneKeys.size === 0) {
      return false;
    }
    // The key is derived from a placeholder row built from the entry alone.
    const probeKey = persist.key(probeRow(entry, index));
    return doneKeys.has(probeKey);
  };

  const finalize = (row) => {
    const outcome = classify?.(row);
    const finalRow = outcome === undefined ? row : { ...row, outcome };
    if (persisting) {
      appendRowSafely(persist.path, finalRow);
    }
    return finalRow;
  };

  return { isResumed, finalize };
}
15732
15839
  async function applyScorer(scorer, output, expected) {
15733
15840
  let raw;
15734
15841
  try {
@@ -15774,7 +15881,15 @@ function makeAgentForBatch(spec, _entries) {
15774
15881
  }
15775
15882
  return spec;
15776
15883
  }
15777
- async function runRowsManually(entries, spec, scorers, concurrency, signal, onRow) {
15884
/**
 * Processes one dataset slot for the manual runner.
 *
 * Slots with no entry, or whose entry the sink reports as already resumed,
 * are left untouched (`rows[idx]` is not assigned and `onRow` is not called).
 * Otherwise the entry is run, finalised through the sink, stored at
 * `rows[idx]` and announced via `onRow`.
 *
 * @param {number} idx - Slot index.
 * @param {Array} entries - Dataset entries.
 * @param {unknown} spec - Agent spec forwarded to `runOneEntry`.
 * @param {Array} scorers - Scorers forwarded to `runOneEntry`.
 * @param {{ isResumed: Function, finalize: Function }} sink - Row sink.
 * @param {Array} rows - Output array, written at `idx`.
 * @param {(row: object, index: number) => void} onRow - Completion callback.
 * @returns {Promise<void>}
 */
async function runManualSlot(idx, entries, spec, scorers, sink, rows, onRow) {
  const entry = entries[idx];
  const skip = entry === undefined || sink.isResumed(entry, idx);
  if (skip) {
    return;
  }
  const rawRow = await runOneEntry(spec, entry, idx, scorers);
  const row = sink.finalize(rawRow);
  rows[idx] = row;
  onRow(row, idx);
}
15892
+ async function runRowsManually(entries, spec, scorers, concurrency, signal, onRow, sink) {
15778
15893
  const rows = new Array(entries.length);
15779
15894
  const state2 = { cursor: 0 };
15780
15895
  const worker = async () => {
@@ -15782,11 +15897,7 @@ async function runRowsManually(entries, spec, scorers, concurrency, signal, onRo
15782
15897
  if (signal?.aborted === true) return;
15783
15898
  const idx = state2.cursor;
15784
15899
  state2.cursor += 1;
15785
- const entry = entries[idx];
15786
- if (entry === void 0) continue;
15787
- const row = await runOneEntry(spec, entry, idx, scorers);
15788
- rows[idx] = row;
15789
- onRow(row, idx);
15900
+ await runManualSlot(idx, entries, spec, scorers, sink, rows, onRow);
15790
15901
  }
15791
15902
  };
15792
15903
  const workers = Array.from({ length: Math.min(concurrency, entries.length) }, () => worker());
@@ -15836,23 +15947,32 @@ async function scoreBatchOutput(br, expected, scorers) {
15836
15947
  }
15837
15948
  return scoreEntries;
15838
15949
  }
15839
/**
 * Runs every entry that still needs work through the agent facade's `batch`
 * call, scores each result and hands the finalised rows to `onRow`.
 *
 * Entries the sink reports as resumed are left out of the batch, and each
 * produced row keeps the entry's ORIGINAL dataset index. Rows are finalised
 * (and therefore persisted by the sink) only after the whole batch call has
 * resolved. When nothing is pending the agent is not invoked at all.
 *
 * @param {Array} entries - Dataset entries (`input`, optional `expected`).
 * @param {object} agentOptions - Options spread into the batch options.
 * @param {Array} scorers - Scorers applied to each batch result.
 * @param {number} concurrency - Forwarded to `batch`.
 * @param {AbortSignal | undefined} signal - Forwarded to `batch` when defined.
 * @param {(row: object, index: number) => void} onRow - Called per new row.
 * @param {{ isResumed: Function, finalize: Function }} sink - Row sink.
 * @returns {Promise<Array>} Rows produced by this call (resumed rows excluded).
 */
async function runRowsViaBatch(entries, agentOptions, scorers, concurrency, signal, onRow, sink) {
  const pending = [];
  for (let i = 0; i < entries.length; i += 1) {
    const entry = entries[i];
    if (entry === void 0) continue;
    if (sink.isResumed(entry, i)) continue;
    pending.push({ entry, index: i });
  }
  // Fully resumed (or empty) dataset: there is nothing to send to the agent.
  if (pending.length === 0) return [];
  const batchOpts = {
    ...agentOptions,
    concurrency,
    ...signal !== void 0 ? { signal } : {}
  };
  const batchResults = await getAgentFacade().batch(
    pending.map((p) => p.entry.input),
    batchOpts
  );
  const rows = [];
  for (let i = 0; i < batchResults.length; i += 1) {
    // Batch results are positional: result i belongs to pending slot i.
    const slot = pending[i];
    const br = batchResults[i];
    if (slot === void 0 || br === void 0) continue;
    const scoreEntries = await scoreBatchOutput(br, slot.entry.expected, scorers);
    const row = sink.finalize(rowFromBatchResult(slot.entry, br, scoreEntries, slot.index));
    rows.push(row);
    onRow(row, slot.index);
  }
  return rows;
}
@@ -15877,6 +15997,7 @@ async function runEval(options, runOpts) {
15877
15997
  const onRow = (row, i) => {
15878
15998
  safeHook(() => hooks?.afterRow?.(row, i));
15879
15999
  };
16000
+ const sink = makeRowSink(runOpts?.persist, runOpts?.classify);
15880
16001
  let rows;
15881
16002
  if (isAgentInstance(options.agent) || typeof options.agent === "function") {
15882
16003
  rows = await runRowsManually(
@@ -15885,11 +16006,12 @@ async function runEval(options, runOpts) {
15885
16006
  scorers,
15886
16007
  concurrency,
15887
16008
  signal,
15888
- onRow
16009
+ onRow,
16010
+ sink
15889
16011
  );
15890
16012
  } else {
15891
16013
  const batchOpts = makeAgentForBatch(options.agent, indexed);
15892
- rows = await runRowsViaBatch(indexed, batchOpts, scorers, concurrency, signal, onRow);
16014
+ rows = await runRowsViaBatch(indexed, batchOpts, scorers, concurrency, signal, onRow, sink);
15893
16015
  }
15894
16016
  const aggregate = computeAggregate(rows);
15895
16017
  const endedAt = Date.now();
@@ -15915,6 +16037,25 @@ async function runEval(options, runOpts) {
15915
16037
  }
15916
16038
  }
15917
16039
 
16040
+ // src/sandbox/shell-escape.ts
16041
/**
 * Quotes a string for use as one word in a POSIX shell command: the value is
 * wrapped in single quotes and every embedded single quote is rewritten as
 * `'\''` (close quote, escaped quote, reopen quote).
 *
 * @param {string} arg - Raw argument.
 * @returns {string} The single-quoted form.
 */
function shellEscapePosix(arg) {
  const escapedQuote = "'\\''";
  const inner = arg.split("'").join(escapedQuote);
  return `'${inner}'`;
}
16044
+
16045
+ // src/internal/eval/code-runner.ts
16046
// File name (relative to the repo directory) the captured diff is uploaded under.
var ARTIFACT_PATCH = ".theo-artifact.patch";

/**
 * Captures the repository's current `git diff` as an artifact and checks the
 * patch against the working tree.
 *
 * An empty diff short-circuits to `{ diff: "", applies: false }`. Otherwise
 * the diff is uploaded to `<repoDir>/.theo-artifact.patch` and `applies`
 * reports whether `git apply --check --reverse` exited with code 0. The
 * uploaded patch file is not removed by this function.
 *
 * @param {{ execute: Function, uploadFile: Function }} sandbox - Backend used
 *   to run the git commands and upload the patch.
 * @param {string} repoDir - Repository directory inside the sandbox.
 * @returns {Promise<{ diff: string, applies: boolean }>}
 */
async function captureArtifact(sandbox, repoDir) {
  const quotedDir = shellEscapePosix(repoDir);
  const { stdout: diff } = await sandbox.execute(`git -C ${quotedDir} diff`);
  if (diff.length === 0) {
    return { diff: "", applies: false };
  }
  const patchPath = `${repoDir}/${ARTIFACT_PATCH}`;
  await sandbox.uploadFile(patchPath, diff);
  // `-C` runs git from the repo directory, so the relative patch name is enough.
  const quotedPatch = shellEscapePosix(ARTIFACT_PATCH);
  const { exitCode } = await sandbox.execute(
    `git -C ${quotedDir} apply --check --reverse ${quotedPatch}`
  );
  return { diff, applies: exitCode === 0 };
}
16058
+
15918
16059
  // src/internal/scorers/llm-judge.ts
15919
16060
  init_agent_factory_registry();
15920
16061
  function buildPrompt(subject, criteria, rubric, expected) {
@@ -16057,6 +16198,38 @@ var Scorers = {
16057
16198
  }
16058
16199
  };
16059
16200
  },
16201
+ /**
16202
+ * Verify-gate scorer (M6-2): runs the project's tests in the provisioned
16203
+ * repo via `SandboxBackend.execute` and scores `1` iff the command exits `0`,
16204
+ * else `0` with the exit code + truncated stderr in `reason`. Grades the
16205
+ * artifact captured by `captureArtifact` (D2 — rides `execute`, never a
16206
+ * direct `child_process`).
16207
+ *
16208
+ * SECURITY: `command` is REQUIRED and the caller's builder owns shell-safety
16209
+ * of the (potentially untrusted, dataset-derived) test identifiers. There is
16210
+ * NO default that runs bare test names — that would interpolate untrusted
16211
+ * `failToPass`/`passToPass` straight into a shell. `repoDir` is shell-escaped
16212
+ * by the SDK; the test list is the builder's responsibility to render safely.
16213
+ *
16214
+ * PORTABILITY: the command is wrapped as `cd <repoDir> && <cmd>`, which
16215
+ * assumes a shell-backed `SandboxBackend` (LocalSandbox/Docker). A backend
16216
+ * that rejects shell metacharacters in `execute` is unsupported for this scorer.
16217
+ */
16218
+ verifyGate(opts) {
16219
+ const { sandbox, repoDir, failToPass, passToPass, command } = opts;
16220
+ return {
16221
+ name: "verify-gate",
16222
+ score: async () => {
16223
+ const cmd = command([...failToPass, ...passToPass]).trim();
16224
+ if (cmd.length === 0) {
16225
+ return { score: 0, reason: "verify_gate_empty_command" };
16226
+ }
16227
+ const r = await sandbox.execute(`cd ${shellEscapePosix(repoDir)} && ${cmd}`);
16228
+ if (r.exitCode === 0) return { score: 1 };
16229
+ return { score: 0, reason: `exit=${r.exitCode} ${r.stderr.slice(0, 200)}`.trim() };
16230
+ }
16231
+ };
16232
+ },
16060
16233
  jsonShape(schema, opts = {}) {
16061
16234
  return {
16062
16235
  name: "json-shape",
@@ -16131,6 +16304,9 @@ var Eval = class _Eval {
16131
16304
 
16132
16305
  exports.Eval = Eval;
16133
16306
  exports.EvalAlreadyRunningError = EvalAlreadyRunningError;
16307
+ exports.JsonlParseError = JsonlParseError;
16134
16308
  exports.Scorers = Scorers;
16309
+ exports.captureArtifact = captureArtifact;
16310
+ exports.loadJsonl = loadJsonl;
16135
16311
  //# sourceMappingURL=eval.cjs.map
16136
16312
  //# sourceMappingURL=eval.cjs.map