@theokit/sdk 2.4.0 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/CHANGELOG.md +22 -0
  2. package/README.md +4 -0
  3. package/dist/eval.cjs +192 -16
  4. package/dist/eval.cjs.map +1 -1
  5. package/dist/eval.d.cts +2 -0
  6. package/dist/eval.d.ts +2 -0
  7. package/dist/eval.js +191 -18
  8. package/dist/eval.js.map +1 -1
  9. package/dist/index.cjs +31 -8
  10. package/dist/index.cjs.map +1 -1
  11. package/dist/index.d.cts +40 -5
  12. package/dist/index.d.ts +40 -5
  13. package/dist/index.js +31 -9
  14. package/dist/index.js.map +1 -1
  15. package/dist/internal/eval/code-runner.d.ts +28 -0
  16. package/dist/internal/persistence/index.cjs +68 -0
  17. package/dist/internal/persistence/index.cjs.map +1 -1
  18. package/dist/internal/persistence/index.d.cts +1 -0
  19. package/dist/internal/persistence/index.d.ts +1 -0
  20. package/dist/internal/persistence/index.js +65 -1
  21. package/dist/internal/persistence/index.js.map +1 -1
  22. package/dist/internal/persistence/jsonl.d.cts +34 -0
  23. package/dist/internal/persistence/jsonl.d.ts +34 -0
  24. package/dist/permission-engine.d.ts +12 -4
  25. package/dist/persistence.cjs +318 -0
  26. package/dist/persistence.cjs.map +1 -0
  27. package/dist/persistence.d.cts +24 -0
  28. package/dist/persistence.d.ts +24 -0
  29. package/dist/persistence.js +306 -0
  30. package/dist/persistence.js.map +1 -0
  31. package/dist/sandbox/index.cjs +71 -1
  32. package/dist/sandbox/index.cjs.map +1 -1
  33. package/dist/sandbox/index.d.cts +1 -0
  34. package/dist/sandbox/index.d.ts +1 -0
  35. package/dist/sandbox/index.js +70 -2
  36. package/dist/sandbox/index.js.map +1 -1
  37. package/dist/sandbox/provision.d.cts +53 -0
  38. package/dist/sandbox/provision.d.ts +53 -0
  39. package/dist/sandbox/shell-escape.d.cts +8 -0
  40. package/dist/sandbox/shell-escape.d.ts +8 -0
  41. package/dist/scorers.d.ts +19 -1
  42. package/dist/types/eval.d.ts +71 -0
  43. package/package.json +11 -1
package/dist/eval.d.cts CHANGED
@@ -31,6 +31,8 @@ export declare class Eval {
31
31
  */
32
32
  run(runOpts?: EvalRunOptions): Promise<EvalRun>;
33
33
  }
34
+ export { captureArtifact } from "./internal/eval/code-runner.js";
34
35
  export { EvalAlreadyRunningError } from "./internal/eval/single-flight.js";
36
+ export { JsonlParseError, loadJsonl } from "./internal/persistence/jsonl.js";
35
37
  export { Scorers } from "./scorers.js";
36
38
  export type * from "./types/eval.js";
package/dist/eval.d.ts CHANGED
@@ -31,6 +31,8 @@ export declare class Eval {
31
31
  */
32
32
  run(runOpts?: EvalRunOptions): Promise<EvalRun>;
33
33
  }
34
+ export { captureArtifact } from "./internal/eval/code-runner.js";
34
35
  export { EvalAlreadyRunningError } from "./internal/eval/single-flight.js";
36
+ export { JsonlParseError, loadJsonl } from "./internal/persistence/jsonl.js";
35
37
  export { Scorers } from "./scorers.js";
36
38
  export type * from "./types/eval.js";
package/dist/eval.js CHANGED
@@ -2,7 +2,7 @@ import { randomUUID, randomBytes, createHash } from 'crypto';
2
2
  import { readFile, unlink, mkdir, open, rename, statfs, stat, rm, readdir, appendFile, access } from 'fs/promises';
3
3
  import { join, dirname, resolve, sep, relative, isAbsolute } from 'path';
4
4
  import { z, toJSONSchema } from 'zod';
5
- import { mkdirSync, readdirSync, existsSync, realpathSync, lstatSync, readlinkSync, readFileSync } from 'fs';
5
+ import { readFileSync, mkdirSync, appendFileSync, readdirSync, existsSync, realpathSync, lstatSync, readlinkSync } from 'fs';
6
6
  import { AsyncLocalStorage } from 'async_hooks';
7
7
  import { createRequire } from 'module';
8
8
  import { homedir } from 'os';
@@ -15535,6 +15535,69 @@ setAgentFacade({
15535
15535
  resume: (agentId, options) => Agent.resume(agentId, options),
15536
15536
  batch: (prompts, options) => Agent.batch(prompts, options)
15537
15537
  });
15538
+ var JsonlParseError = class extends Error {
15539
+ constructor(message, line) {
15540
+ super(message);
15541
+ this.line = line;
15542
+ this.name = "JsonlParseError";
15543
+ }
15544
+ line;
15545
+ };
15546
+ function isPlainObject(value) {
15547
+ return typeof value === "object" && value !== null && !Array.isArray(value);
15548
+ }
15549
+ function tryParseObjectLine(line) {
15550
+ if (line.length === 0) return void 0;
15551
+ let parsed;
15552
+ try {
15553
+ parsed = JSON.parse(line);
15554
+ } catch {
15555
+ return void 0;
15556
+ }
15557
+ return isPlainObject(parsed) ? parsed : void 0;
15558
+ }
15559
+ function loadJsonl(path, opts = {}) {
15560
+ const text = readFileSync(path, "utf8");
15561
+ const out = [];
15562
+ let lineNumber = 0;
15563
+ for (const rawLine of text.split("\n")) {
15564
+ lineNumber += 1;
15565
+ const line = rawLine.trim();
15566
+ if (line.length === 0) continue;
15567
+ let parsed;
15568
+ try {
15569
+ parsed = JSON.parse(line);
15570
+ } catch {
15571
+ throw new JsonlParseError(`line ${lineNumber}: invalid JSON`, lineNumber);
15572
+ }
15573
+ if (!isPlainObject(parsed)) {
15574
+ throw new JsonlParseError(`line ${lineNumber}: not a JSON object`, lineNumber);
15575
+ }
15576
+ out.push(opts.map ? opts.map(parsed, lineNumber) : parsed);
15577
+ }
15578
+ return out;
15579
+ }
15580
+ function appendJsonl(path, record) {
15581
+ mkdirSync(dirname(path), { recursive: true });
15582
+ appendFileSync(path, `${JSON.stringify(record)}
15583
+ `);
15584
+ }
15585
+ function readJsonlIds(path, keyFn) {
15586
+ const done = /* @__PURE__ */ new Set();
15587
+ let text;
15588
+ try {
15589
+ text = readFileSync(path, "utf8");
15590
+ } catch {
15591
+ return done;
15592
+ }
15593
+ for (const rawLine of text.split("\n")) {
15594
+ const parsed = tryParseObjectLine(rawLine.trim());
15595
+ if (parsed === void 0) continue;
15596
+ const key = keyFn(parsed);
15597
+ if (typeof key === "string" && key.length > 0) done.add(key);
15598
+ }
15599
+ return done;
15600
+ }
15538
15601
 
15539
15602
  // src/internal/eval/runner.ts
15540
15603
  init_agent_factory_registry();
@@ -15726,6 +15789,50 @@ function normalizeScorers(input) {
15726
15789
  return { name: s.name, score: s.score };
15727
15790
  });
15728
15791
  }
15792
+ function probeRow(entry, index) {
15793
+ return {
15794
+ index,
15795
+ input: entry.input,
15796
+ output: "",
15797
+ ...entry.expected !== void 0 ? { expected: entry.expected } : {},
15798
+ scores: [],
15799
+ meanScore: 0,
15800
+ durationMs: 0,
15801
+ ...entry.metadata !== void 0 ? { metadata: entry.metadata } : {}
15802
+ };
15803
+ }
15804
+ function computeDoneKeys(persist) {
15805
+ if (persist.resume !== true) return /* @__PURE__ */ new Set();
15806
+ return readJsonlIds(
15807
+ persist.path,
15808
+ (parsed) => parsed.error === void 0 ? persist.key(parsed) : void 0
15809
+ );
15810
+ }
15811
+ function appendRowSafely(path, row) {
15812
+ try {
15813
+ appendJsonl(path, row);
15814
+ } catch (err) {
15815
+ console.warn(
15816
+ "[eval] persist append failed (ignored):",
15817
+ err instanceof Error ? err.message : err
15818
+ );
15819
+ }
15820
+ }
15821
+ function makeRowSink(persist, classify) {
15822
+ const doneKeys = persist !== void 0 ? computeDoneKeys(persist) : /* @__PURE__ */ new Set();
15823
+ return {
15824
+ isResumed(entry, index) {
15825
+ if (persist === void 0 || doneKeys.size === 0) return false;
15826
+ return doneKeys.has(persist.key(probeRow(entry, index)));
15827
+ },
15828
+ finalize(row) {
15829
+ const outcome = classify?.(row);
15830
+ const finalRow = outcome !== void 0 ? { ...row, outcome } : row;
15831
+ if (persist !== void 0) appendRowSafely(persist.path, finalRow);
15832
+ return finalRow;
15833
+ }
15834
+ };
15835
+ }
15729
15836
  async function applyScorer(scorer, output, expected) {
15730
15837
  let raw;
15731
15838
  try {
@@ -15771,7 +15878,15 @@ function makeAgentForBatch(spec, _entries) {
15771
15878
  }
15772
15879
  return spec;
15773
15880
  }
15774
- async function runRowsManually(entries, spec, scorers, concurrency, signal, onRow) {
15881
+ async function runManualSlot(idx, entries, spec, scorers, sink, rows, onRow) {
15882
+ const entry = entries[idx];
15883
+ if (entry === void 0) return;
15884
+ if (sink.isResumed(entry, idx)) return;
15885
+ const row = sink.finalize(await runOneEntry(spec, entry, idx, scorers));
15886
+ rows[idx] = row;
15887
+ onRow(row, idx);
15888
+ }
15889
+ async function runRowsManually(entries, spec, scorers, concurrency, signal, onRow, sink) {
15775
15890
  const rows = new Array(entries.length);
15776
15891
  const state2 = { cursor: 0 };
15777
15892
  const worker = async () => {
@@ -15779,11 +15894,7 @@ async function runRowsManually(entries, spec, scorers, concurrency, signal, onRo
15779
15894
  if (signal?.aborted === true) return;
15780
15895
  const idx = state2.cursor;
15781
15896
  state2.cursor += 1;
15782
- const entry = entries[idx];
15783
- if (entry === void 0) continue;
15784
- const row = await runOneEntry(spec, entry, idx, scorers);
15785
- rows[idx] = row;
15786
- onRow(row, idx);
15897
+ await runManualSlot(idx, entries, spec, scorers, sink, rows, onRow);
15787
15898
  }
15788
15899
  };
15789
15900
  const workers = Array.from({ length: Math.min(concurrency, entries.length) }, () => worker());
@@ -15833,23 +15944,32 @@ async function scoreBatchOutput(br, expected, scorers) {
15833
15944
  }
15834
15945
  return scoreEntries;
15835
15946
  }
15836
- async function runRowsViaBatch(entries, agentOptions, scorers, concurrency, signal, onRow) {
15837
- const prompts = entries.map((e) => e.input);
15947
+ async function runRowsViaBatch(entries, agentOptions, scorers, concurrency, signal, onRow, sink) {
15948
+ const pending = [];
15949
+ for (let i = 0; i < entries.length; i += 1) {
15950
+ const entry = entries[i];
15951
+ if (entry === void 0) continue;
15952
+ if (sink.isResumed(entry, i)) continue;
15953
+ pending.push({ entry, index: i });
15954
+ }
15838
15955
  const batchOpts = {
15839
15956
  ...agentOptions,
15840
15957
  concurrency,
15841
15958
  ...signal !== void 0 ? { signal } : {}
15842
15959
  };
15843
- const batchResults = await getAgentFacade().batch(prompts, batchOpts);
15960
+ const batchResults = await getAgentFacade().batch(
15961
+ pending.map((p) => p.entry.input),
15962
+ batchOpts
15963
+ );
15844
15964
  const rows = [];
15845
15965
  for (let i = 0; i < batchResults.length; i += 1) {
15846
- const entry = entries[i];
15966
+ const slot = pending[i];
15847
15967
  const br = batchResults[i];
15848
- if (entry === void 0 || br === void 0) continue;
15849
- const scoreEntries = await scoreBatchOutput(br, entry.expected, scorers);
15850
- const row = rowFromBatchResult(entry, br, scoreEntries, i);
15968
+ if (slot === void 0 || br === void 0) continue;
15969
+ const scoreEntries = await scoreBatchOutput(br, slot.entry.expected, scorers);
15970
+ const row = sink.finalize(rowFromBatchResult(slot.entry, br, scoreEntries, slot.index));
15851
15971
  rows.push(row);
15852
- onRow(row, i);
15972
+ onRow(row, slot.index);
15853
15973
  }
15854
15974
  return rows;
15855
15975
  }
@@ -15874,6 +15994,7 @@ async function runEval(options, runOpts) {
15874
15994
  const onRow = (row, i) => {
15875
15995
  safeHook(() => hooks?.afterRow?.(row, i));
15876
15996
  };
15997
+ const sink = makeRowSink(runOpts?.persist, runOpts?.classify);
15877
15998
  let rows;
15878
15999
  if (isAgentInstance(options.agent) || typeof options.agent === "function") {
15879
16000
  rows = await runRowsManually(
@@ -15882,11 +16003,12 @@ async function runEval(options, runOpts) {
15882
16003
  scorers,
15883
16004
  concurrency,
15884
16005
  signal,
15885
- onRow
16006
+ onRow,
16007
+ sink
15886
16008
  );
15887
16009
  } else {
15888
16010
  const batchOpts = makeAgentForBatch(options.agent, indexed);
15889
- rows = await runRowsViaBatch(indexed, batchOpts, scorers, concurrency, signal, onRow);
16011
+ rows = await runRowsViaBatch(indexed, batchOpts, scorers, concurrency, signal, onRow, sink);
15890
16012
  }
15891
16013
  const aggregate = computeAggregate(rows);
15892
16014
  const endedAt = Date.now();
@@ -15912,6 +16034,25 @@ async function runEval(options, runOpts) {
15912
16034
  }
15913
16035
  }
15914
16036
 
16037
+ // src/sandbox/shell-escape.ts
16038
+ function shellEscapePosix(arg) {
16039
+ return `'${arg.replace(/'/g, "'\\''")}'`;
16040
+ }
16041
+
16042
+ // src/internal/eval/code-runner.ts
16043
+ var ARTIFACT_PATCH = ".theo-artifact.patch";
16044
+ async function captureArtifact(sandbox, repoDir) {
16045
+ const dir = shellEscapePosix(repoDir);
16046
+ const diffRes = await sandbox.execute(`git -C ${dir} diff`);
16047
+ const diff = diffRes.stdout;
16048
+ if (diff.length === 0) return { diff: "", applies: false };
16049
+ await sandbox.uploadFile(`${repoDir}/${ARTIFACT_PATCH}`, diff);
16050
+ const check = await sandbox.execute(
16051
+ `git -C ${dir} apply --check --reverse ${shellEscapePosix(ARTIFACT_PATCH)}`
16052
+ );
16053
+ return { diff, applies: check.exitCode === 0 };
16054
+ }
16055
+
15915
16056
  // src/internal/scorers/llm-judge.ts
15916
16057
  init_agent_factory_registry();
15917
16058
  function buildPrompt(subject, criteria, rubric, expected) {
@@ -16054,6 +16195,38 @@ var Scorers = {
16054
16195
  }
16055
16196
  };
16056
16197
  },
16198
+ /**
16199
+ * Verify-gate scorer (M6-2): runs the project's tests in the provisioned
16200
+ * repo via `SandboxBackend.execute` and scores `1` iff the command exits `0`,
16201
+ * else `0` with the exit code + truncated stderr in `reason`. Grades the
16202
+ * artifact captured by `captureArtifact` (D2 — rides `execute`, never a
16203
+ * direct `child_process`).
16204
+ *
16205
+ * SECURITY: `command` is REQUIRED and the caller's builder owns shell-safety
16206
+ * of the (potentially untrusted, dataset-derived) test identifiers. There is
16207
+ * NO default that runs bare test names — that would interpolate untrusted
16208
+ * `failToPass`/`passToPass` straight into a shell. `repoDir` is shell-escaped
16209
+ * by the SDK; the test list is the builder's responsibility to render safely.
16210
+ *
16211
+ * PORTABILITY: the command is wrapped as `cd <repoDir> && <cmd>`, which
16212
+ * assumes a shell-backed `SandboxBackend` (LocalSandbox/Docker). A backend
16213
+ * that rejects shell metacharacters in `execute` is unsupported for this scorer.
16214
+ */
16215
+ verifyGate(opts) {
16216
+ const { sandbox, repoDir, failToPass, passToPass, command } = opts;
16217
+ return {
16218
+ name: "verify-gate",
16219
+ score: async () => {
16220
+ const cmd = command([...failToPass, ...passToPass]).trim();
16221
+ if (cmd.length === 0) {
16222
+ return { score: 0, reason: "verify_gate_empty_command" };
16223
+ }
16224
+ const r = await sandbox.execute(`cd ${shellEscapePosix(repoDir)} && ${cmd}`);
16225
+ if (r.exitCode === 0) return { score: 1 };
16226
+ return { score: 0, reason: `exit=${r.exitCode} ${r.stderr.slice(0, 200)}`.trim() };
16227
+ }
16228
+ };
16229
+ },
16057
16230
  jsonShape(schema, opts = {}) {
16058
16231
  return {
16059
16232
  name: "json-shape",
@@ -16126,6 +16299,6 @@ var Eval = class _Eval {
16126
16299
  }
16127
16300
  };
16128
16301
 
16129
- export { Eval, EvalAlreadyRunningError, Scorers };
16302
+ export { Eval, EvalAlreadyRunningError, JsonlParseError, Scorers, captureArtifact, loadJsonl };
16130
16303
  //# sourceMappingURL=eval.js.map
16131
16304
  //# sourceMappingURL=eval.js.map