npm - @theokit/sdk - Versions diffs - 2.4.0 → 2.5.0 - Mend

@theokit/sdk 2.4.0 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

package/CHANGELOG.md +16 -0
package/dist/eval.cjs +192 -16
package/dist/eval.cjs.map +1 -1
package/dist/eval.d.cts +2 -0
package/dist/eval.d.ts +2 -0
package/dist/eval.js +191 -18
package/dist/eval.js.map +1 -1
package/dist/index.cjs +31 -8
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +40 -5
package/dist/index.d.ts +40 -5
package/dist/index.js +31 -9
package/dist/index.js.map +1 -1
package/dist/internal/eval/code-runner.d.ts +28 -0
package/dist/internal/persistence/index.cjs +68 -0
package/dist/internal/persistence/index.cjs.map +1 -1
package/dist/internal/persistence/index.d.cts +1 -0
package/dist/internal/persistence/index.d.ts +1 -0
package/dist/internal/persistence/index.js +65 -1
package/dist/internal/persistence/index.js.map +1 -1
package/dist/internal/persistence/jsonl.d.cts +34 -0
package/dist/internal/persistence/jsonl.d.ts +34 -0
package/dist/permission-engine.d.ts +12 -4
package/dist/sandbox/index.cjs +71 -1
package/dist/sandbox/index.cjs.map +1 -1
package/dist/sandbox/index.d.cts +1 -0
package/dist/sandbox/index.d.ts +1 -0
package/dist/sandbox/index.js +70 -2
package/dist/sandbox/index.js.map +1 -1
package/dist/sandbox/provision.d.cts +53 -0
package/dist/sandbox/provision.d.ts +53 -0
package/dist/sandbox/shell-escape.d.cts +8 -0
package/dist/sandbox/shell-escape.d.ts +8 -0
package/dist/scorers.d.ts +19 -1
package/dist/types/eval.d.ts +71 -0
package/package.json +14 -14

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,21 @@
 # Changelog
+## 2.5.0
+### Minor Changes
+- 301d4a3: Eval harness (M6, Tema E): first-party SWE-bench-style primitives over the existing `Eval`/`Scorers`/`SandboxBackend` surface, with zero new runtime dependencies.
+  - `loadJsonl(path, { map? })` from `@theokit/sdk/eval` — generic JSONL dataset loader with line-numbered `JsonlParseError`; the dataset schema is the caller's via `map`.
+  - Durable batch: `Eval.run({ persist: { path, key, resume }, classify })` flushes each row the instant it completes and resumes a crashed run by skipping already-persisted rows.
+  - `provisionRepo(sandbox, { repoUrl, ref, instanceId })` + `RepoProvisionError` — portable git clone+checkout over `SandboxBackend.execute`.
+  - `Scorers.verifyGate({ failToPass, passToPass })` — grades a patch by test exit-code via the sandbox; `EvalRowResult.artifact` carries `{ diff, applies }`.
+- 32180fe: M7 (Tema F) SDK slice — PermissionEngine default-deny + plugin wiring.
+  - `PermissionEngine` now takes `{ defaultAction }` (default `"allow"`, backward-compatible) — opt into default-deny with `new PermissionEngine(rules, { defaultAction: "deny" })`. `PermissionAction`/`PermissionRule`/`PermissionEngineOptions` types are now exported.
+  - New `createPermissionPlugin(engine, opts?)` wires a `PermissionEngine` into the `definePlugin` `pre_tool_call` veto (the engine was previously exported-but-unwired): `deny` blocks, `ask` defers to `opts.onAsk` (fail-closed block by default), `allow` passes.
 ## 2.4.0
 ### Minor Changes

package/dist/eval.cjs CHANGED Viewed

@@ -15538,6 +15538,69 @@ setAgentFacade({
   resume: (agentId, options) => Agent.resume(agentId, options),
   batch: (prompts, options) => Agent.batch(prompts, options)
 });
+var JsonlParseError = class extends Error {
+  constructor(message, line) {
+    super(message);
+    this.line = line;
+    this.name = "JsonlParseError";
+  }
+  line;
+};
+function isPlainObject(value) {
+  return typeof value === "object" && value !== null && !Array.isArray(value);
+}
+function tryParseObjectLine(line) {
+  if (line.length === 0) return void 0;
+  let parsed;
+  try {
+    parsed = JSON.parse(line);
+  } catch {
+    return void 0;
+  }
+  return isPlainObject(parsed) ? parsed : void 0;
+}
+function loadJsonl(path, opts = {}) {
+  const text = fs.readFileSync(path, "utf8");
+  const out = [];
+  let lineNumber = 0;
+  for (const rawLine of text.split("\n")) {
+    lineNumber += 1;
+    const line = rawLine.trim();
+    if (line.length === 0) continue;
+    let parsed;
+    try {
+      parsed = JSON.parse(line);
+    } catch {
+      throw new JsonlParseError(`line ${lineNumber}: invalid JSON`, lineNumber);
+    }
+    if (!isPlainObject(parsed)) {
+      throw new JsonlParseError(`line ${lineNumber}: not a JSON object`, lineNumber);
+    }
+    out.push(opts.map ? opts.map(parsed, lineNumber) : parsed);
+  }
+  return out;
+}
+function appendJsonl(path$1, record) {
+  fs.mkdirSync(path.dirname(path$1), { recursive: true });
+  fs.appendFileSync(path$1, `${JSON.stringify(record)}
+`);
+}
+function readJsonlIds(path, keyFn) {
+  const done = /* @__PURE__ */ new Set();
+  let text;
+  try {
+    text = fs.readFileSync(path, "utf8");
+  } catch {
+    return done;
+  }
+  for (const rawLine of text.split("\n")) {
+    const parsed = tryParseObjectLine(rawLine.trim());
+    if (parsed === void 0) continue;
+    const key = keyFn(parsed);
+    if (typeof key === "string" && key.length > 0) done.add(key);
+  }
+  return done;
+}
 // src/internal/eval/runner.ts
 init_agent_factory_registry();
@@ -15729,6 +15792,50 @@ function normalizeScorers(input) {
     return { name: s.name, score: s.score };
   });
 }
+function probeRow(entry, index) {
+  return {
+    index,
+    input: entry.input,
+    output: "",
+    ...entry.expected !== void 0 ? { expected: entry.expected } : {},
+    scores: [],
+    meanScore: 0,
+    durationMs: 0,
+    ...entry.metadata !== void 0 ? { metadata: entry.metadata } : {}
+  };
+}
+function computeDoneKeys(persist) {
+  if (persist.resume !== true) return /* @__PURE__ */ new Set();
+  return readJsonlIds(
+    persist.path,
+    (parsed) => parsed.error === void 0 ? persist.key(parsed) : void 0
+  );
+}
+function appendRowSafely(path, row) {
+  try {
+    appendJsonl(path, row);
+  } catch (err) {
+    console.warn(
+      "[eval] persist append failed (ignored):",
+      err instanceof Error ? err.message : err
+    );
+  }
+}
+function makeRowSink(persist, classify) {
+  const doneKeys = persist !== void 0 ? computeDoneKeys(persist) : /* @__PURE__ */ new Set();
+  return {
+    isResumed(entry, index) {
+      if (persist === void 0 || doneKeys.size === 0) return false;
+      return doneKeys.has(persist.key(probeRow(entry, index)));
+    },
+    finalize(row) {
+      const outcome = classify?.(row);
+      const finalRow = outcome !== void 0 ? { ...row, outcome } : row;
+      if (persist !== void 0) appendRowSafely(persist.path, finalRow);
+      return finalRow;
+    }
+  };
+}
 async function applyScorer(scorer, output, expected) {
   let raw;
   try {
@@ -15774,7 +15881,15 @@ function makeAgentForBatch(spec, _entries) {
   }
   return spec;
 }
-async function runRowsManually(entries, spec, scorers, concurrency, signal, onRow) {
+async function runManualSlot(idx, entries, spec, scorers, sink, rows, onRow) {
+  const entry = entries[idx];
+  if (entry === void 0) return;
+  if (sink.isResumed(entry, idx)) return;
+  const row = sink.finalize(await runOneEntry(spec, entry, idx, scorers));
+  rows[idx] = row;
+  onRow(row, idx);
+}
+async function runRowsManually(entries, spec, scorers, concurrency, signal, onRow, sink) {
   const rows = new Array(entries.length);
   const state2 = { cursor: 0 };
   const worker = async () => {
@@ -15782,11 +15897,7 @@ async function runRowsManually(entries, spec, scorers, concurrency, signal, onRo
       if (signal?.aborted === true) return;
       const idx = state2.cursor;
       state2.cursor += 1;
-      const entry = entries[idx];
-      if (entry === void 0) continue;
-      const row = await runOneEntry(spec, entry, idx, scorers);
-      rows[idx] = row;
-      onRow(row, idx);
+      await runManualSlot(idx, entries, spec, scorers, sink, rows, onRow);
     }
   };
   const workers = Array.from({ length: Math.min(concurrency, entries.length) }, () => worker());
@@ -15836,23 +15947,32 @@ async function scoreBatchOutput(br, expected, scorers) {
   }
   return scoreEntries;
 }
-async function runRowsViaBatch(entries, agentOptions, scorers, concurrency, signal, onRow) {
-  const prompts = entries.map((e) => e.input);
+async function runRowsViaBatch(entries, agentOptions, scorers, concurrency, signal, onRow, sink) {
+  const pending = [];
+  for (let i = 0; i < entries.length; i += 1) {
+    const entry = entries[i];
+    if (entry === void 0) continue;
+    if (sink.isResumed(entry, i)) continue;
+    pending.push({ entry, index: i });
+  }
   const batchOpts = {
     ...agentOptions,
     concurrency,
     ...signal !== void 0 ? { signal } : {}
   };
-  const batchResults = await getAgentFacade().batch(prompts, batchOpts);
+  const batchResults = await getAgentFacade().batch(
+    pending.map((p) => p.entry.input),
+    batchOpts
+  );
   const rows = [];
   for (let i = 0; i < batchResults.length; i += 1) {
-    const entry = entries[i];
+    const slot = pending[i];
     const br = batchResults[i];
-    if (entry === void 0 || br === void 0) continue;
-    const scoreEntries = await scoreBatchOutput(br, entry.expected, scorers);
-    const row = rowFromBatchResult(entry, br, scoreEntries, i);
+    if (slot === void 0 || br === void 0) continue;
+    const scoreEntries = await scoreBatchOutput(br, slot.entry.expected, scorers);
+    const row = sink.finalize(rowFromBatchResult(slot.entry, br, scoreEntries, slot.index));
     rows.push(row);
-    onRow(row, i);
+    onRow(row, slot.index);
   }
   return rows;
 }
@@ -15877,6 +15997,7 @@ async function runEval(options, runOpts) {
     const onRow = (row, i) => {
       safeHook(() => hooks?.afterRow?.(row, i));
     };
+    const sink = makeRowSink(runOpts?.persist, runOpts?.classify);
     let rows;
     if (isAgentInstance(options.agent) || typeof options.agent === "function") {
       rows = await runRowsManually(
@@ -15885,11 +16006,12 @@ async function runEval(options, runOpts) {
         scorers,
         concurrency,
         signal,
-        onRow
+        onRow,
+        sink
       );
     } else {
       const batchOpts = makeAgentForBatch(options.agent, indexed);
-      rows = await runRowsViaBatch(indexed, batchOpts, scorers, concurrency, signal, onRow);
+      rows = await runRowsViaBatch(indexed, batchOpts, scorers, concurrency, signal, onRow, sink);
     }
     const aggregate = computeAggregate(rows);
     const endedAt = Date.now();
@@ -15915,6 +16037,25 @@ async function runEval(options, runOpts) {
   }
 }
+// src/sandbox/shell-escape.ts
+function shellEscapePosix(arg) {
+  return `'${arg.replace(/'/g, "'\\''")}'`;
+}
+// src/internal/eval/code-runner.ts
+var ARTIFACT_PATCH = ".theo-artifact.patch";
+async function captureArtifact(sandbox, repoDir) {
+  const dir = shellEscapePosix(repoDir);
+  const diffRes = await sandbox.execute(`git -C ${dir} diff`);
+  const diff = diffRes.stdout;
+  if (diff.length === 0) return { diff: "", applies: false };
+  await sandbox.uploadFile(`${repoDir}/${ARTIFACT_PATCH}`, diff);
+  const check = await sandbox.execute(
+    `git -C ${dir} apply --check --reverse ${shellEscapePosix(ARTIFACT_PATCH)}`
+  );
+  return { diff, applies: check.exitCode === 0 };
+}
 // src/internal/scorers/llm-judge.ts
 init_agent_factory_registry();
 function buildPrompt(subject, criteria, rubric, expected) {
@@ -16057,6 +16198,38 @@ var Scorers = {
       }
     };
   },
+  /**
+   * Verify-gate scorer (M6-2): runs the project's tests in the provisioned
+   * repo via `SandboxBackend.execute` and scores `1` iff the command exits `0`,
+   * else `0` with the exit code + truncated stderr in `reason`. Grades the
+   * artifact captured by `captureArtifact` (D2 — rides `execute`, never a
+   * direct `child_process`).
+   *
+   * SECURITY: `command` is REQUIRED and the caller's builder owns shell-safety
+   * of the (potentially untrusted, dataset-derived) test identifiers. There is
+   * NO default that runs bare test names — that would interpolate untrusted
+   * `failToPass`/`passToPass` straight into a shell. `repoDir` is shell-escaped
+   * by the SDK; the test list is the builder's responsibility to render safely.
+   *
+   * PORTABILITY: the command is wrapped as `cd <repoDir> && <cmd>`, which
+   * assumes a shell-backed `SandboxBackend` (LocalSandbox/Docker). A backend
+   * that rejects shell metacharacters in `execute` is unsupported for this scorer.
+   */
+  verifyGate(opts) {
+    const { sandbox, repoDir, failToPass, passToPass, command } = opts;
+    return {
+      name: "verify-gate",
+      score: async () => {
+        const cmd = command([...failToPass, ...passToPass]).trim();
+        if (cmd.length === 0) {
+          return { score: 0, reason: "verify_gate_empty_command" };
+        }
+        const r = await sandbox.execute(`cd ${shellEscapePosix(repoDir)} && ${cmd}`);
+        if (r.exitCode === 0) return { score: 1 };
+        return { score: 0, reason: `exit=${r.exitCode} ${r.stderr.slice(0, 200)}`.trim() };
+      }
+    };
+  },
   jsonShape(schema, opts = {}) {
     return {
       name: "json-shape",
@@ -16131,6 +16304,9 @@ var Eval = class _Eval {
 exports.Eval = Eval;
 exports.EvalAlreadyRunningError = EvalAlreadyRunningError;
+exports.JsonlParseError = JsonlParseError;
 exports.Scorers = Scorers;
+exports.captureArtifact = captureArtifact;
+exports.loadJsonl = loadJsonl;
 //# sourceMappingURL=eval.cjs.map
 //# sourceMappingURL=eval.cjs.map