@theokit/sdk 2.4.0 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +16 -0
- package/dist/eval.cjs +192 -16
- package/dist/eval.cjs.map +1 -1
- package/dist/eval.d.cts +2 -0
- package/dist/eval.d.ts +2 -0
- package/dist/eval.js +191 -18
- package/dist/eval.js.map +1 -1
- package/dist/index.cjs +31 -8
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +40 -5
- package/dist/index.d.ts +40 -5
- package/dist/index.js +31 -9
- package/dist/index.js.map +1 -1
- package/dist/internal/eval/code-runner.d.ts +28 -0
- package/dist/internal/persistence/index.cjs +68 -0
- package/dist/internal/persistence/index.cjs.map +1 -1
- package/dist/internal/persistence/index.d.cts +1 -0
- package/dist/internal/persistence/index.d.ts +1 -0
- package/dist/internal/persistence/index.js +65 -1
- package/dist/internal/persistence/index.js.map +1 -1
- package/dist/internal/persistence/jsonl.d.cts +34 -0
- package/dist/internal/persistence/jsonl.d.ts +34 -0
- package/dist/permission-engine.d.ts +12 -4
- package/dist/sandbox/index.cjs +71 -1
- package/dist/sandbox/index.cjs.map +1 -1
- package/dist/sandbox/index.d.cts +1 -0
- package/dist/sandbox/index.d.ts +1 -0
- package/dist/sandbox/index.js +70 -2
- package/dist/sandbox/index.js.map +1 -1
- package/dist/sandbox/provision.d.cts +53 -0
- package/dist/sandbox/provision.d.ts +53 -0
- package/dist/sandbox/shell-escape.d.cts +8 -0
- package/dist/sandbox/shell-escape.d.ts +8 -0
- package/dist/scorers.d.ts +19 -1
- package/dist/types/eval.d.ts +71 -0
- package/package.json +14 -14
package/dist/eval.d.cts
CHANGED
|
@@ -31,6 +31,8 @@ export declare class Eval {
|
|
|
31
31
|
*/
|
|
32
32
|
run(runOpts?: EvalRunOptions): Promise<EvalRun>;
|
|
33
33
|
}
|
|
34
|
+
export { captureArtifact } from "./internal/eval/code-runner.js";
|
|
34
35
|
export { EvalAlreadyRunningError } from "./internal/eval/single-flight.js";
|
|
36
|
+
export { JsonlParseError, loadJsonl } from "./internal/persistence/jsonl.js";
|
|
35
37
|
export { Scorers } from "./scorers.js";
|
|
36
38
|
export type * from "./types/eval.js";
|
package/dist/eval.d.ts
CHANGED
|
@@ -31,6 +31,8 @@ export declare class Eval {
|
|
|
31
31
|
*/
|
|
32
32
|
run(runOpts?: EvalRunOptions): Promise<EvalRun>;
|
|
33
33
|
}
|
|
34
|
+
export { captureArtifact } from "./internal/eval/code-runner.js";
|
|
34
35
|
export { EvalAlreadyRunningError } from "./internal/eval/single-flight.js";
|
|
36
|
+
export { JsonlParseError, loadJsonl } from "./internal/persistence/jsonl.js";
|
|
35
37
|
export { Scorers } from "./scorers.js";
|
|
36
38
|
export type * from "./types/eval.js";
|
package/dist/eval.js
CHANGED
|
@@ -2,7 +2,7 @@ import { randomUUID, randomBytes, createHash } from 'crypto';
|
|
|
2
2
|
import { readFile, unlink, mkdir, open, rename, statfs, stat, rm, readdir, appendFile, access } from 'fs/promises';
|
|
3
3
|
import { join, dirname, resolve, sep, relative, isAbsolute } from 'path';
|
|
4
4
|
import { z, toJSONSchema } from 'zod';
|
|
5
|
-
import { mkdirSync, readdirSync, existsSync, realpathSync, lstatSync, readlinkSync
|
|
5
|
+
import { readFileSync, mkdirSync, appendFileSync, readdirSync, existsSync, realpathSync, lstatSync, readlinkSync } from 'fs';
|
|
6
6
|
import { AsyncLocalStorage } from 'async_hooks';
|
|
7
7
|
import { createRequire } from 'module';
|
|
8
8
|
import { homedir } from 'os';
|
|
@@ -15535,6 +15535,69 @@ setAgentFacade({
|
|
|
15535
15535
|
resume: (agentId, options) => Agent.resume(agentId, options),
|
|
15536
15536
|
batch: (prompts, options) => Agent.batch(prompts, options)
|
|
15537
15537
|
});
|
|
15538
|
+
var JsonlParseError = class extends Error {
|
|
15539
|
+
constructor(message, line) {
|
|
15540
|
+
super(message);
|
|
15541
|
+
this.line = line;
|
|
15542
|
+
this.name = "JsonlParseError";
|
|
15543
|
+
}
|
|
15544
|
+
line;
|
|
15545
|
+
};
|
|
15546
|
+
function isPlainObject(value) {
|
|
15547
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
15548
|
+
}
|
|
15549
|
+
function tryParseObjectLine(line) {
|
|
15550
|
+
if (line.length === 0) return void 0;
|
|
15551
|
+
let parsed;
|
|
15552
|
+
try {
|
|
15553
|
+
parsed = JSON.parse(line);
|
|
15554
|
+
} catch {
|
|
15555
|
+
return void 0;
|
|
15556
|
+
}
|
|
15557
|
+
return isPlainObject(parsed) ? parsed : void 0;
|
|
15558
|
+
}
|
|
15559
|
+
function loadJsonl(path, opts = {}) {
|
|
15560
|
+
const text = readFileSync(path, "utf8");
|
|
15561
|
+
const out = [];
|
|
15562
|
+
let lineNumber = 0;
|
|
15563
|
+
for (const rawLine of text.split("\n")) {
|
|
15564
|
+
lineNumber += 1;
|
|
15565
|
+
const line = rawLine.trim();
|
|
15566
|
+
if (line.length === 0) continue;
|
|
15567
|
+
let parsed;
|
|
15568
|
+
try {
|
|
15569
|
+
parsed = JSON.parse(line);
|
|
15570
|
+
} catch {
|
|
15571
|
+
throw new JsonlParseError(`line ${lineNumber}: invalid JSON`, lineNumber);
|
|
15572
|
+
}
|
|
15573
|
+
if (!isPlainObject(parsed)) {
|
|
15574
|
+
throw new JsonlParseError(`line ${lineNumber}: not a JSON object`, lineNumber);
|
|
15575
|
+
}
|
|
15576
|
+
out.push(opts.map ? opts.map(parsed, lineNumber) : parsed);
|
|
15577
|
+
}
|
|
15578
|
+
return out;
|
|
15579
|
+
}
|
|
15580
|
+
function appendJsonl(path, record) {
|
|
15581
|
+
mkdirSync(dirname(path), { recursive: true });
|
|
15582
|
+
appendFileSync(path, `${JSON.stringify(record)}
|
|
15583
|
+
`);
|
|
15584
|
+
}
|
|
15585
|
+
function readJsonlIds(path, keyFn) {
|
|
15586
|
+
const done = /* @__PURE__ */ new Set();
|
|
15587
|
+
let text;
|
|
15588
|
+
try {
|
|
15589
|
+
text = readFileSync(path, "utf8");
|
|
15590
|
+
} catch {
|
|
15591
|
+
return done;
|
|
15592
|
+
}
|
|
15593
|
+
for (const rawLine of text.split("\n")) {
|
|
15594
|
+
const parsed = tryParseObjectLine(rawLine.trim());
|
|
15595
|
+
if (parsed === void 0) continue;
|
|
15596
|
+
const key = keyFn(parsed);
|
|
15597
|
+
if (typeof key === "string" && key.length > 0) done.add(key);
|
|
15598
|
+
}
|
|
15599
|
+
return done;
|
|
15600
|
+
}
|
|
15538
15601
|
|
|
15539
15602
|
// src/internal/eval/runner.ts
|
|
15540
15603
|
init_agent_factory_registry();
|
|
@@ -15726,6 +15789,50 @@ function normalizeScorers(input) {
|
|
|
15726
15789
|
return { name: s.name, score: s.score };
|
|
15727
15790
|
});
|
|
15728
15791
|
}
|
|
15792
|
+
function probeRow(entry, index) {
|
|
15793
|
+
return {
|
|
15794
|
+
index,
|
|
15795
|
+
input: entry.input,
|
|
15796
|
+
output: "",
|
|
15797
|
+
...entry.expected !== void 0 ? { expected: entry.expected } : {},
|
|
15798
|
+
scores: [],
|
|
15799
|
+
meanScore: 0,
|
|
15800
|
+
durationMs: 0,
|
|
15801
|
+
...entry.metadata !== void 0 ? { metadata: entry.metadata } : {}
|
|
15802
|
+
};
|
|
15803
|
+
}
|
|
15804
|
+
function computeDoneKeys(persist) {
|
|
15805
|
+
if (persist.resume !== true) return /* @__PURE__ */ new Set();
|
|
15806
|
+
return readJsonlIds(
|
|
15807
|
+
persist.path,
|
|
15808
|
+
(parsed) => parsed.error === void 0 ? persist.key(parsed) : void 0
|
|
15809
|
+
);
|
|
15810
|
+
}
|
|
15811
|
+
function appendRowSafely(path, row) {
|
|
15812
|
+
try {
|
|
15813
|
+
appendJsonl(path, row);
|
|
15814
|
+
} catch (err) {
|
|
15815
|
+
console.warn(
|
|
15816
|
+
"[eval] persist append failed (ignored):",
|
|
15817
|
+
err instanceof Error ? err.message : err
|
|
15818
|
+
);
|
|
15819
|
+
}
|
|
15820
|
+
}
|
|
15821
|
+
function makeRowSink(persist, classify) {
|
|
15822
|
+
const doneKeys = persist !== void 0 ? computeDoneKeys(persist) : /* @__PURE__ */ new Set();
|
|
15823
|
+
return {
|
|
15824
|
+
isResumed(entry, index) {
|
|
15825
|
+
if (persist === void 0 || doneKeys.size === 0) return false;
|
|
15826
|
+
return doneKeys.has(persist.key(probeRow(entry, index)));
|
|
15827
|
+
},
|
|
15828
|
+
finalize(row) {
|
|
15829
|
+
const outcome = classify?.(row);
|
|
15830
|
+
const finalRow = outcome !== void 0 ? { ...row, outcome } : row;
|
|
15831
|
+
if (persist !== void 0) appendRowSafely(persist.path, finalRow);
|
|
15832
|
+
return finalRow;
|
|
15833
|
+
}
|
|
15834
|
+
};
|
|
15835
|
+
}
|
|
15729
15836
|
async function applyScorer(scorer, output, expected) {
|
|
15730
15837
|
let raw;
|
|
15731
15838
|
try {
|
|
@@ -15771,7 +15878,15 @@ function makeAgentForBatch(spec, _entries) {
|
|
|
15771
15878
|
}
|
|
15772
15879
|
return spec;
|
|
15773
15880
|
}
|
|
15774
|
-
async function
|
|
15881
|
+
async function runManualSlot(idx, entries, spec, scorers, sink, rows, onRow) {
|
|
15882
|
+
const entry = entries[idx];
|
|
15883
|
+
if (entry === void 0) return;
|
|
15884
|
+
if (sink.isResumed(entry, idx)) return;
|
|
15885
|
+
const row = sink.finalize(await runOneEntry(spec, entry, idx, scorers));
|
|
15886
|
+
rows[idx] = row;
|
|
15887
|
+
onRow(row, idx);
|
|
15888
|
+
}
|
|
15889
|
+
async function runRowsManually(entries, spec, scorers, concurrency, signal, onRow, sink) {
|
|
15775
15890
|
const rows = new Array(entries.length);
|
|
15776
15891
|
const state2 = { cursor: 0 };
|
|
15777
15892
|
const worker = async () => {
|
|
@@ -15779,11 +15894,7 @@ async function runRowsManually(entries, spec, scorers, concurrency, signal, onRo
|
|
|
15779
15894
|
if (signal?.aborted === true) return;
|
|
15780
15895
|
const idx = state2.cursor;
|
|
15781
15896
|
state2.cursor += 1;
|
|
15782
|
-
|
|
15783
|
-
if (entry === void 0) continue;
|
|
15784
|
-
const row = await runOneEntry(spec, entry, idx, scorers);
|
|
15785
|
-
rows[idx] = row;
|
|
15786
|
-
onRow(row, idx);
|
|
15897
|
+
await runManualSlot(idx, entries, spec, scorers, sink, rows, onRow);
|
|
15787
15898
|
}
|
|
15788
15899
|
};
|
|
15789
15900
|
const workers = Array.from({ length: Math.min(concurrency, entries.length) }, () => worker());
|
|
@@ -15833,23 +15944,32 @@ async function scoreBatchOutput(br, expected, scorers) {
|
|
|
15833
15944
|
}
|
|
15834
15945
|
return scoreEntries;
|
|
15835
15946
|
}
|
|
15836
|
-
async function runRowsViaBatch(entries, agentOptions, scorers, concurrency, signal, onRow) {
|
|
15837
|
-
const
|
|
15947
|
+
async function runRowsViaBatch(entries, agentOptions, scorers, concurrency, signal, onRow, sink) {
|
|
15948
|
+
const pending = [];
|
|
15949
|
+
for (let i = 0; i < entries.length; i += 1) {
|
|
15950
|
+
const entry = entries[i];
|
|
15951
|
+
if (entry === void 0) continue;
|
|
15952
|
+
if (sink.isResumed(entry, i)) continue;
|
|
15953
|
+
pending.push({ entry, index: i });
|
|
15954
|
+
}
|
|
15838
15955
|
const batchOpts = {
|
|
15839
15956
|
...agentOptions,
|
|
15840
15957
|
concurrency,
|
|
15841
15958
|
...signal !== void 0 ? { signal } : {}
|
|
15842
15959
|
};
|
|
15843
|
-
const batchResults = await getAgentFacade().batch(
|
|
15960
|
+
const batchResults = await getAgentFacade().batch(
|
|
15961
|
+
pending.map((p) => p.entry.input),
|
|
15962
|
+
batchOpts
|
|
15963
|
+
);
|
|
15844
15964
|
const rows = [];
|
|
15845
15965
|
for (let i = 0; i < batchResults.length; i += 1) {
|
|
15846
|
-
const
|
|
15966
|
+
const slot = pending[i];
|
|
15847
15967
|
const br = batchResults[i];
|
|
15848
|
-
if (
|
|
15849
|
-
const scoreEntries = await scoreBatchOutput(br, entry.expected, scorers);
|
|
15850
|
-
const row = rowFromBatchResult(entry, br, scoreEntries,
|
|
15968
|
+
if (slot === void 0 || br === void 0) continue;
|
|
15969
|
+
const scoreEntries = await scoreBatchOutput(br, slot.entry.expected, scorers);
|
|
15970
|
+
const row = sink.finalize(rowFromBatchResult(slot.entry, br, scoreEntries, slot.index));
|
|
15851
15971
|
rows.push(row);
|
|
15852
|
-
onRow(row,
|
|
15972
|
+
onRow(row, slot.index);
|
|
15853
15973
|
}
|
|
15854
15974
|
return rows;
|
|
15855
15975
|
}
|
|
@@ -15874,6 +15994,7 @@ async function runEval(options, runOpts) {
|
|
|
15874
15994
|
const onRow = (row, i) => {
|
|
15875
15995
|
safeHook(() => hooks?.afterRow?.(row, i));
|
|
15876
15996
|
};
|
|
15997
|
+
const sink = makeRowSink(runOpts?.persist, runOpts?.classify);
|
|
15877
15998
|
let rows;
|
|
15878
15999
|
if (isAgentInstance(options.agent) || typeof options.agent === "function") {
|
|
15879
16000
|
rows = await runRowsManually(
|
|
@@ -15882,11 +16003,12 @@ async function runEval(options, runOpts) {
|
|
|
15882
16003
|
scorers,
|
|
15883
16004
|
concurrency,
|
|
15884
16005
|
signal,
|
|
15885
|
-
onRow
|
|
16006
|
+
onRow,
|
|
16007
|
+
sink
|
|
15886
16008
|
);
|
|
15887
16009
|
} else {
|
|
15888
16010
|
const batchOpts = makeAgentForBatch(options.agent, indexed);
|
|
15889
|
-
rows = await runRowsViaBatch(indexed, batchOpts, scorers, concurrency, signal, onRow);
|
|
16011
|
+
rows = await runRowsViaBatch(indexed, batchOpts, scorers, concurrency, signal, onRow, sink);
|
|
15890
16012
|
}
|
|
15891
16013
|
const aggregate = computeAggregate(rows);
|
|
15892
16014
|
const endedAt = Date.now();
|
|
@@ -15912,6 +16034,25 @@ async function runEval(options, runOpts) {
|
|
|
15912
16034
|
}
|
|
15913
16035
|
}
|
|
15914
16036
|
|
|
16037
|
+
// src/sandbox/shell-escape.ts
|
|
16038
|
+
function shellEscapePosix(arg) {
|
|
16039
|
+
return `'${arg.replace(/'/g, "'\\''")}'`;
|
|
16040
|
+
}
|
|
16041
|
+
|
|
16042
|
+
// src/internal/eval/code-runner.ts
|
|
16043
|
+
var ARTIFACT_PATCH = ".theo-artifact.patch";
|
|
16044
|
+
async function captureArtifact(sandbox, repoDir) {
|
|
16045
|
+
const dir = shellEscapePosix(repoDir);
|
|
16046
|
+
const diffRes = await sandbox.execute(`git -C ${dir} diff`);
|
|
16047
|
+
const diff = diffRes.stdout;
|
|
16048
|
+
if (diff.length === 0) return { diff: "", applies: false };
|
|
16049
|
+
await sandbox.uploadFile(`${repoDir}/${ARTIFACT_PATCH}`, diff);
|
|
16050
|
+
const check = await sandbox.execute(
|
|
16051
|
+
`git -C ${dir} apply --check --reverse ${shellEscapePosix(ARTIFACT_PATCH)}`
|
|
16052
|
+
);
|
|
16053
|
+
return { diff, applies: check.exitCode === 0 };
|
|
16054
|
+
}
|
|
16055
|
+
|
|
15915
16056
|
// src/internal/scorers/llm-judge.ts
|
|
15916
16057
|
init_agent_factory_registry();
|
|
15917
16058
|
function buildPrompt(subject, criteria, rubric, expected) {
|
|
@@ -16054,6 +16195,38 @@ var Scorers = {
|
|
|
16054
16195
|
}
|
|
16055
16196
|
};
|
|
16056
16197
|
},
|
|
16198
|
+
/**
|
|
16199
|
+
* Verify-gate scorer (M6-2): runs the project's tests in the provisioned
|
|
16200
|
+
* repo via `SandboxBackend.execute` and scores `1` iff the command exits `0`,
|
|
16201
|
+
* else `0` with the exit code + truncated stderr in `reason`. Grades the
|
|
16202
|
+
* artifact captured by `captureArtifact` (D2 — rides `execute`, never a
|
|
16203
|
+
* direct `child_process`).
|
|
16204
|
+
*
|
|
16205
|
+
* SECURITY: `command` is REQUIRED and the caller's builder owns shell-safety
|
|
16206
|
+
* of the (potentially untrusted, dataset-derived) test identifiers. There is
|
|
16207
|
+
* NO default that runs bare test names — that would interpolate untrusted
|
|
16208
|
+
* `failToPass`/`passToPass` straight into a shell. `repoDir` is shell-escaped
|
|
16209
|
+
* by the SDK; the test list is the builder's responsibility to render safely.
|
|
16210
|
+
*
|
|
16211
|
+
* PORTABILITY: the command is wrapped as `cd <repoDir> && <cmd>`, which
|
|
16212
|
+
* assumes a shell-backed `SandboxBackend` (LocalSandbox/Docker). A backend
|
|
16213
|
+
* that rejects shell metacharacters in `execute` is unsupported for this scorer.
|
|
16214
|
+
*/
|
|
16215
|
+
verifyGate(opts) {
|
|
16216
|
+
const { sandbox, repoDir, failToPass, passToPass, command } = opts;
|
|
16217
|
+
return {
|
|
16218
|
+
name: "verify-gate",
|
|
16219
|
+
score: async () => {
|
|
16220
|
+
const cmd = command([...failToPass, ...passToPass]).trim();
|
|
16221
|
+
if (cmd.length === 0) {
|
|
16222
|
+
return { score: 0, reason: "verify_gate_empty_command" };
|
|
16223
|
+
}
|
|
16224
|
+
const r = await sandbox.execute(`cd ${shellEscapePosix(repoDir)} && ${cmd}`);
|
|
16225
|
+
if (r.exitCode === 0) return { score: 1 };
|
|
16226
|
+
return { score: 0, reason: `exit=${r.exitCode} ${r.stderr.slice(0, 200)}`.trim() };
|
|
16227
|
+
}
|
|
16228
|
+
};
|
|
16229
|
+
},
|
|
16057
16230
|
jsonShape(schema, opts = {}) {
|
|
16058
16231
|
return {
|
|
16059
16232
|
name: "json-shape",
|
|
@@ -16126,6 +16299,6 @@ var Eval = class _Eval {
|
|
|
16126
16299
|
}
|
|
16127
16300
|
};
|
|
16128
16301
|
|
|
16129
|
-
export { Eval, EvalAlreadyRunningError, Scorers };
|
|
16302
|
+
export { Eval, EvalAlreadyRunningError, JsonlParseError, Scorers, captureArtifact, loadJsonl };
|
|
16130
16303
|
//# sourceMappingURL=eval.js.map
|
|
16131
16304
|
//# sourceMappingURL=eval.js.map
|