@theokit/sdk 2.3.0 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +113 -0
- package/dist/a2a/index.cjs +103 -48
- package/dist/a2a/index.cjs.map +1 -1
- package/dist/a2a/index.js +104 -49
- package/dist/a2a/index.js.map +1 -1
- package/dist/compaction.cjs +78 -0
- package/dist/compaction.cjs.map +1 -0
- package/dist/compaction.d.cts +76 -0
- package/dist/compaction.d.ts +76 -0
- package/dist/compaction.js +70 -0
- package/dist/compaction.js.map +1 -0
- package/dist/{cron-B_H8rn-j.d.cts → cron-B656C3iq.d.cts} +8 -0
- package/dist/{cron-DX6HbHxd.d.ts → cron-CM2M9mhB.d.ts} +8 -0
- package/dist/cron.cjs +104 -57
- package/dist/cron.cjs.map +1 -1
- package/dist/cron.d.cts +1 -1
- package/dist/cron.d.ts +1 -1
- package/dist/cron.js +104 -57
- package/dist/cron.js.map +1 -1
- package/dist/eval.cjs +296 -73
- package/dist/eval.cjs.map +1 -1
- package/dist/eval.d.cts +2 -0
- package/dist/eval.d.ts +2 -0
- package/dist/eval.js +295 -75
- package/dist/eval.js.map +1 -1
- package/dist/index.cjs +135 -65
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +42 -7
- package/dist/index.d.ts +42 -7
- package/dist/index.js +135 -66
- package/dist/index.js.map +1 -1
- package/dist/internal/agent-loop/loop.d.ts +5 -0
- package/dist/internal/eval/code-runner.d.ts +28 -0
- package/dist/internal/llm/model-capabilities.d.ts +40 -0
- package/dist/internal/llm/model-identifier.d.ts +9 -1
- package/dist/internal/llm/model-option.d.ts +38 -0
- package/dist/internal/persistence/index.cjs +68 -0
- package/dist/internal/persistence/index.cjs.map +1 -1
- package/dist/internal/persistence/index.d.cts +1 -0
- package/dist/internal/persistence/index.d.ts +1 -0
- package/dist/internal/persistence/index.js +65 -1
- package/dist/internal/persistence/index.js.map +1 -1
- package/dist/internal/persistence/jsonl.d.cts +34 -0
- package/dist/internal/persistence/jsonl.d.ts +34 -0
- package/dist/internal/runtime/compression/compression-attempt.d.ts +24 -0
- package/dist/internal/runtime/compression/compression-config.d.ts +33 -0
- package/dist/internal/runtime/compression/compression-decision.d.ts +10 -0
- package/dist/internal/runtime/compression/compression-helpers.d.ts +18 -0
- package/dist/internal/runtime/compression/compression-model-registry.d.ts +41 -0
- package/dist/internal/runtime/compression/compression-summarizer.d.ts +29 -0
- package/dist/internal/runtime/context/project-instructions.d.ts +66 -0
- package/dist/internal/runtime/context/replay-history.d.ts +43 -0
- package/dist/internal/runtime/hooks/hooks-frontmatter.d.ts +1 -1
- package/dist/internal/runtime/skills/discover-skills.d.ts +68 -0
- package/dist/internal/runtime/skills/skills-block.d.ts +18 -0
- package/dist/internal/runtime/skills/subagent-tool-scope.d.ts +25 -0
- package/dist/messages.cjs +24 -0
- package/dist/messages.cjs.map +1 -0
- package/dist/messages.d.cts +33 -0
- package/dist/messages.d.ts +33 -0
- package/dist/messages.js +20 -0
- package/dist/messages.js.map +1 -0
- package/dist/models.cjs +233 -0
- package/dist/models.cjs.map +1 -0
- package/dist/models.d.cts +16 -0
- package/dist/models.d.ts +16 -0
- package/dist/models.js +228 -0
- package/dist/models.js.map +1 -0
- package/dist/permission-engine.d.ts +12 -4
- package/dist/project.cjs +149 -0
- package/dist/project.cjs.map +1 -0
- package/dist/project.d.cts +14 -0
- package/dist/project.d.ts +14 -0
- package/dist/project.js +146 -0
- package/dist/project.js.map +1 -0
- package/dist/sandbox/index.cjs +71 -1
- package/dist/sandbox/index.cjs.map +1 -1
- package/dist/sandbox/index.d.cts +1 -0
- package/dist/sandbox/index.d.ts +1 -0
- package/dist/sandbox/index.js +70 -2
- package/dist/sandbox/index.js.map +1 -1
- package/dist/sandbox/provision.d.cts +53 -0
- package/dist/sandbox/provision.d.ts +53 -0
- package/dist/sandbox/shell-escape.d.cts +8 -0
- package/dist/sandbox/shell-escape.d.ts +8 -0
- package/dist/scorers.d.ts +19 -1
- package/dist/skills.cjs +282 -0
- package/dist/skills.cjs.map +1 -0
- package/dist/skills.d.cts +19 -0
- package/dist/skills.d.ts +19 -0
- package/dist/skills.js +279 -0
- package/dist/skills.js.map +1 -0
- package/dist/subagents.cjs +24 -0
- package/dist/subagents.cjs.map +1 -0
- package/dist/subagents.d.cts +14 -0
- package/dist/subagents.d.ts +14 -0
- package/dist/subagents.js +21 -0
- package/dist/subagents.js.map +1 -0
- package/dist/types/agent.d.ts +8 -0
- package/dist/types/eval.d.ts +71 -0
- package/package.json +74 -14
package/dist/eval.cjs
CHANGED
|
@@ -6499,6 +6499,8 @@ function parseSubagentMarkdown(raw, filename) {
|
|
|
6499
6499
|
if (fields.model !== void 0) {
|
|
6500
6500
|
definition.model = fields.model === "inherit" ? "inherit" : { id: fields.model };
|
|
6501
6501
|
}
|
|
6502
|
+
const tools = fields.tools?.split(/[\s,]+/).map((t) => t.trim()).filter((t) => t.length > 0);
|
|
6503
|
+
if (tools !== void 0 && tools.length > 0) definition.tools = tools;
|
|
6502
6504
|
return { name, definition };
|
|
6503
6505
|
}
|
|
6504
6506
|
function splitFrontmatter2(raw, filename) {
|
|
@@ -6662,21 +6664,24 @@ ${lines.join("\n")}
|
|
|
6662
6664
|
}
|
|
6663
6665
|
};
|
|
6664
6666
|
|
|
6667
|
+
// src/internal/runtime/skills/skills-block.ts
|
|
6668
|
+
function buildSkillsBlock(skills) {
|
|
6669
|
+
if (skills.length === 0) return void 0;
|
|
6670
|
+
const lines = skills.map(
|
|
6671
|
+
(skill) => ` - ${escapeBlockBody(skill.name)}: ${escapeBlockBody(skill.description)}`
|
|
6672
|
+
);
|
|
6673
|
+
return `<skills>
|
|
6674
|
+
${lines.join("\n")}
|
|
6675
|
+
</skills>`;
|
|
6676
|
+
}
|
|
6677
|
+
|
|
6665
6678
|
// src/internal/runtime/system-prompt/sources/skills-provider.ts
|
|
6666
6679
|
var SkillsPromptProvider = class {
|
|
6667
6680
|
id = "skills";
|
|
6668
6681
|
priority = 20;
|
|
6669
6682
|
contribute(ctx) {
|
|
6670
6683
|
if (ctx.skillsAutoInject === false) return Promise.resolve(void 0);
|
|
6671
|
-
|
|
6672
|
-
const lines = ctx.skills.map((skill) => {
|
|
6673
|
-
const name = escapeBlockBody(skill.name);
|
|
6674
|
-
const description = escapeBlockBody(skill.description);
|
|
6675
|
-
return ` - ${name}: ${description}`;
|
|
6676
|
-
});
|
|
6677
|
-
return Promise.resolve(`<skills>
|
|
6678
|
-
${lines.join("\n")}
|
|
6679
|
-
</skills>`);
|
|
6684
|
+
return Promise.resolve(buildSkillsBlock(ctx.skills));
|
|
6680
6685
|
}
|
|
6681
6686
|
};
|
|
6682
6687
|
|
|
@@ -7784,7 +7789,7 @@ async function loadPluginManifestFromMarkdown(pluginsRoot, folderName) {
|
|
|
7784
7789
|
return metadata;
|
|
7785
7790
|
}
|
|
7786
7791
|
|
|
7787
|
-
// src/internal/runtime/skills/skills
|
|
7792
|
+
// src/internal/runtime/skills/discover-skills.ts
|
|
7788
7793
|
init_errors();
|
|
7789
7794
|
|
|
7790
7795
|
// src/internal/runtime/skills/skill-frontmatter.ts
|
|
@@ -7856,6 +7861,61 @@ function hasContent(value) {
|
|
|
7856
7861
|
return value !== void 0 && value.trim().length > 0;
|
|
7857
7862
|
}
|
|
7858
7863
|
|
|
7864
|
+
// src/internal/runtime/skills/discover-skills.ts
|
|
7865
|
+
async function discoverSkills(dir, options) {
|
|
7866
|
+
let entries;
|
|
7867
|
+
try {
|
|
7868
|
+
entries = await readWorkspaceDir(dir, "skills_read_error", "skills directory");
|
|
7869
|
+
} catch {
|
|
7870
|
+
return [];
|
|
7871
|
+
}
|
|
7872
|
+
const skills = [];
|
|
7873
|
+
for (const entry of entries) {
|
|
7874
|
+
if (!entry.isDirectory()) continue;
|
|
7875
|
+
let skillDir;
|
|
7876
|
+
try {
|
|
7877
|
+
skillDir = safePathJoin(dir, entry.name);
|
|
7878
|
+
assertNoSymlinkEscape(skillDir, dir);
|
|
7879
|
+
} catch {
|
|
7880
|
+
continue;
|
|
7881
|
+
}
|
|
7882
|
+
const skillPath = path.join(skillDir, "SKILL.md");
|
|
7883
|
+
let raw;
|
|
7884
|
+
try {
|
|
7885
|
+
raw = await promises.readFile(skillPath, "utf8");
|
|
7886
|
+
} catch {
|
|
7887
|
+
continue;
|
|
7888
|
+
}
|
|
7889
|
+
const skill = tryParseSkill(raw, entry.name, skillPath, options);
|
|
7890
|
+
if (skill !== void 0) skills.push(skill);
|
|
7891
|
+
}
|
|
7892
|
+
return skills;
|
|
7893
|
+
}
|
|
7894
|
+
function tryParseSkill(raw, fallbackName, source, options) {
|
|
7895
|
+
try {
|
|
7896
|
+
const frontmatter = parseSkillFrontmatter(raw, fallbackName);
|
|
7897
|
+
const skill = {
|
|
7898
|
+
name: frontmatter.name,
|
|
7899
|
+
description: frontmatter.description,
|
|
7900
|
+
source
|
|
7901
|
+
};
|
|
7902
|
+
if (frontmatter.category !== void 0) skill.category = frontmatter.category;
|
|
7903
|
+
if (frontmatter.dependencies !== void 0) skill.dependencies = frontmatter.dependencies;
|
|
7904
|
+
return skill;
|
|
7905
|
+
} catch (cause) {
|
|
7906
|
+
if (cause instanceof ConfigurationError) {
|
|
7907
|
+
options?.onInvalidSkill?.({
|
|
7908
|
+
name: fallbackName,
|
|
7909
|
+
source,
|
|
7910
|
+
code: cause.code ?? "unknown",
|
|
7911
|
+
message: cause.message
|
|
7912
|
+
});
|
|
7913
|
+
return void 0;
|
|
7914
|
+
}
|
|
7915
|
+
throw cause;
|
|
7916
|
+
}
|
|
7917
|
+
}
|
|
7918
|
+
|
|
7859
7919
|
// src/internal/runtime/skills/skills-manager.ts
|
|
7860
7920
|
var SkillsManager = class {
|
|
7861
7921
|
constructor(cwd, _enabled, settingSourcesIncludeProject) {
|
|
@@ -7873,56 +7933,20 @@ var SkillsManager = class {
|
|
|
7873
7933
|
await this.refresh();
|
|
7874
7934
|
}
|
|
7875
7935
|
async refresh() {
|
|
7876
|
-
this.skills = [];
|
|
7877
7936
|
const skillsRoot = path.join(this.cwd, ".theokit", "skills");
|
|
7878
|
-
|
|
7879
|
-
|
|
7880
|
-
|
|
7881
|
-
|
|
7882
|
-
|
|
7883
|
-
|
|
7884
|
-
assertNoSymlinkEscape(skillDir, skillsRoot);
|
|
7885
|
-
} catch {
|
|
7886
|
-
continue;
|
|
7887
|
-
}
|
|
7888
|
-
const skillPath = path.join(skillDir, "SKILL.md");
|
|
7889
|
-
let raw;
|
|
7890
|
-
try {
|
|
7891
|
-
raw = await promises.readFile(skillPath, "utf8");
|
|
7892
|
-
} catch {
|
|
7893
|
-
continue;
|
|
7937
|
+
this.skills = await discoverSkills(skillsRoot, {
|
|
7938
|
+
onInvalidSkill: (info) => {
|
|
7939
|
+
process.stderr.write(
|
|
7940
|
+
`[theokit-sdk] skill ${info.name} skipped (${info.code}): ${info.message}
|
|
7941
|
+
`
|
|
7942
|
+
);
|
|
7894
7943
|
}
|
|
7895
|
-
|
|
7896
|
-
if (metadata !== void 0) this.skills.push(metadata);
|
|
7897
|
-
}
|
|
7944
|
+
});
|
|
7898
7945
|
}
|
|
7899
7946
|
list() {
|
|
7900
7947
|
return Promise.resolve(this.skills);
|
|
7901
7948
|
}
|
|
7902
7949
|
};
|
|
7903
|
-
function tryParseSkill(raw, fallbackName, source) {
|
|
7904
|
-
try {
|
|
7905
|
-
const frontmatter = parseSkillFrontmatter(raw, fallbackName);
|
|
7906
|
-
const metadata = {
|
|
7907
|
-
name: frontmatter.name,
|
|
7908
|
-
description: frontmatter.description,
|
|
7909
|
-
source
|
|
7910
|
-
};
|
|
7911
|
-
if (frontmatter.category !== void 0) metadata.category = frontmatter.category;
|
|
7912
|
-
if (frontmatter.dependencies !== void 0) metadata.dependencies = frontmatter.dependencies;
|
|
7913
|
-
return metadata;
|
|
7914
|
-
} catch (cause) {
|
|
7915
|
-
if (cause instanceof ConfigurationError) {
|
|
7916
|
-
const code = cause.code ?? "unknown";
|
|
7917
|
-
process.stderr.write(
|
|
7918
|
-
`[theokit-sdk] skill ${fallbackName} skipped (${code}): ${cause.message}
|
|
7919
|
-
`
|
|
7920
|
-
);
|
|
7921
|
-
return void 0;
|
|
7922
|
-
}
|
|
7923
|
-
throw cause;
|
|
7924
|
-
}
|
|
7925
|
-
}
|
|
7926
7950
|
|
|
7927
7951
|
// src/internal/runtime/local-agent/local-agent-bootstrap.ts
|
|
7928
7952
|
function registerLocalAgent(args) {
|
|
@@ -8335,6 +8359,7 @@ async function initLoopContext(inputs) {
|
|
|
8335
8359
|
finalStatus: "finished",
|
|
8336
8360
|
usage: new UsageAccumulator(),
|
|
8337
8361
|
nudgeAttempts: 0,
|
|
8362
|
+
stopFeedbackAttempts: 0,
|
|
8338
8363
|
...memoryProviderHandle !== void 0 ? { memoryProviderHandle } : {},
|
|
8339
8364
|
...memorySystemPromptAdditions !== void 0 ? { memorySystemPromptAdditions } : {}
|
|
8340
8365
|
};
|
|
@@ -8479,8 +8504,9 @@ function registerLoopError(ctx, cause) {
|
|
|
8479
8504
|
if (ctx.error !== void 0) return;
|
|
8480
8505
|
const rawMessage = cause?.message;
|
|
8481
8506
|
const message = typeof rawMessage === "string" ? rawMessage : cause instanceof Error ? cause.message : String(cause);
|
|
8507
|
+
const metaCode = cause?.metadata?.code;
|
|
8482
8508
|
const rawCode = cause?.code;
|
|
8483
|
-
const code = typeof rawCode === "string" ? rawCode : void 0;
|
|
8509
|
+
const code = typeof metaCode === "string" ? metaCode : typeof rawCode === "string" ? rawCode : void 0;
|
|
8484
8510
|
ctx.error = code !== void 0 ? { message, code, cause } : { message, cause };
|
|
8485
8511
|
}
|
|
8486
8512
|
async function runCollectorLoop(generator, inputs, ctx) {
|
|
@@ -9276,6 +9302,7 @@ function computeUsageCost(inputs, usage) {
|
|
|
9276
9302
|
|
|
9277
9303
|
// src/internal/agent-loop/loop.ts
|
|
9278
9304
|
var MAX_NUDGE_ATTEMPTS = 2;
|
|
9305
|
+
var MAX_STOP_FEEDBACK_ATTEMPTS = 2;
|
|
9279
9306
|
async function runAgentLoop(inputs) {
|
|
9280
9307
|
const sendSpan = inputs.telemetry?.startSpan("agent.send", {
|
|
9281
9308
|
agentId: inputs.agentId,
|
|
@@ -9433,6 +9460,28 @@ function shouldNudgeAndContinue(ctx, llmOutput) {
|
|
|
9433
9460
|
});
|
|
9434
9461
|
return true;
|
|
9435
9462
|
}
|
|
9463
|
+
async function reflectAfterStop(inputs, ctx) {
|
|
9464
|
+
const result = await inputs.hooks.run({
|
|
9465
|
+
event: "stop",
|
|
9466
|
+
agentId: inputs.agentId,
|
|
9467
|
+
runId: inputs.runId
|
|
9468
|
+
});
|
|
9469
|
+
if (result.blocked) return false;
|
|
9470
|
+
if (ctx.stopFeedbackAttempts >= MAX_STOP_FEEDBACK_ATTEMPTS) return false;
|
|
9471
|
+
const feedback = result.decisions.find(
|
|
9472
|
+
(d) => d.decision === "feedback" && (d.feedback ?? "").length > 0
|
|
9473
|
+
)?.feedback;
|
|
9474
|
+
if (feedback === void 0) return false;
|
|
9475
|
+
ctx.stopFeedbackAttempts += 1;
|
|
9476
|
+
ctx.messages.push({ role: "user", content: [{ type: "text", text: feedback }] });
|
|
9477
|
+
return true;
|
|
9478
|
+
}
|
|
9479
|
+
async function finishOrReflect(inputs, ctx, llmOutput) {
|
|
9480
|
+
if (shouldNudgeAndContinue(ctx, llmOutput)) return "continue";
|
|
9481
|
+
if (await reflectAfterStop(inputs, ctx)) return "continue";
|
|
9482
|
+
ctx.finalStatus = "finished";
|
|
9483
|
+
return "done";
|
|
9484
|
+
}
|
|
9436
9485
|
async function runIteration(inputs, ctx) {
|
|
9437
9486
|
const llmOutput = await streamLlmTurn(inputs, ctx);
|
|
9438
9487
|
accumulateUsage(ctx.usage, llmOutput);
|
|
@@ -9466,9 +9515,7 @@ async function continueOrTerminate(inputs, ctx, llmOutput) {
|
|
|
9466
9515
|
await emitAssistantTextStep(inputs, ctx, llmOutput.text);
|
|
9467
9516
|
}
|
|
9468
9517
|
if (llmOutput.stopReason !== "tool_use" || llmOutput.toolCalls.length === 0) {
|
|
9469
|
-
|
|
9470
|
-
ctx.finalStatus = "finished";
|
|
9471
|
-
return "done";
|
|
9518
|
+
return finishOrReflect(inputs, ctx, llmOutput);
|
|
9472
9519
|
}
|
|
9473
9520
|
ctx.messages.push(buildAssistantTurn(llmOutput.text, llmOutput.toolCalls));
|
|
9474
9521
|
const toolResults = await dispatchTools(inputs, ctx.tools, llmOutput.toolCalls, ctx.events);
|
|
@@ -15491,6 +15538,69 @@ setAgentFacade({
|
|
|
15491
15538
|
resume: (agentId, options) => Agent.resume(agentId, options),
|
|
15492
15539
|
batch: (prompts, options) => Agent.batch(prompts, options)
|
|
15493
15540
|
});
|
|
15541
|
+
var JsonlParseError = class extends Error {
|
|
15542
|
+
constructor(message, line) {
|
|
15543
|
+
super(message);
|
|
15544
|
+
this.line = line;
|
|
15545
|
+
this.name = "JsonlParseError";
|
|
15546
|
+
}
|
|
15547
|
+
line;
|
|
15548
|
+
};
|
|
15549
|
+
function isPlainObject(value) {
|
|
15550
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
15551
|
+
}
|
|
15552
|
+
function tryParseObjectLine(line) {
|
|
15553
|
+
if (line.length === 0) return void 0;
|
|
15554
|
+
let parsed;
|
|
15555
|
+
try {
|
|
15556
|
+
parsed = JSON.parse(line);
|
|
15557
|
+
} catch {
|
|
15558
|
+
return void 0;
|
|
15559
|
+
}
|
|
15560
|
+
return isPlainObject(parsed) ? parsed : void 0;
|
|
15561
|
+
}
|
|
15562
|
+
function loadJsonl(path, opts = {}) {
|
|
15563
|
+
const text = fs.readFileSync(path, "utf8");
|
|
15564
|
+
const out = [];
|
|
15565
|
+
let lineNumber = 0;
|
|
15566
|
+
for (const rawLine of text.split("\n")) {
|
|
15567
|
+
lineNumber += 1;
|
|
15568
|
+
const line = rawLine.trim();
|
|
15569
|
+
if (line.length === 0) continue;
|
|
15570
|
+
let parsed;
|
|
15571
|
+
try {
|
|
15572
|
+
parsed = JSON.parse(line);
|
|
15573
|
+
} catch {
|
|
15574
|
+
throw new JsonlParseError(`line ${lineNumber}: invalid JSON`, lineNumber);
|
|
15575
|
+
}
|
|
15576
|
+
if (!isPlainObject(parsed)) {
|
|
15577
|
+
throw new JsonlParseError(`line ${lineNumber}: not a JSON object`, lineNumber);
|
|
15578
|
+
}
|
|
15579
|
+
out.push(opts.map ? opts.map(parsed, lineNumber) : parsed);
|
|
15580
|
+
}
|
|
15581
|
+
return out;
|
|
15582
|
+
}
|
|
15583
|
+
function appendJsonl(path$1, record) {
|
|
15584
|
+
fs.mkdirSync(path.dirname(path$1), { recursive: true });
|
|
15585
|
+
fs.appendFileSync(path$1, `${JSON.stringify(record)}
|
|
15586
|
+
`);
|
|
15587
|
+
}
|
|
15588
|
+
function readJsonlIds(path, keyFn) {
|
|
15589
|
+
const done = /* @__PURE__ */ new Set();
|
|
15590
|
+
let text;
|
|
15591
|
+
try {
|
|
15592
|
+
text = fs.readFileSync(path, "utf8");
|
|
15593
|
+
} catch {
|
|
15594
|
+
return done;
|
|
15595
|
+
}
|
|
15596
|
+
for (const rawLine of text.split("\n")) {
|
|
15597
|
+
const parsed = tryParseObjectLine(rawLine.trim());
|
|
15598
|
+
if (parsed === void 0) continue;
|
|
15599
|
+
const key = keyFn(parsed);
|
|
15600
|
+
if (typeof key === "string" && key.length > 0) done.add(key);
|
|
15601
|
+
}
|
|
15602
|
+
return done;
|
|
15603
|
+
}
|
|
15494
15604
|
|
|
15495
15605
|
// src/internal/eval/runner.ts
|
|
15496
15606
|
init_agent_factory_registry();
|
|
@@ -15682,6 +15792,50 @@ function normalizeScorers(input) {
|
|
|
15682
15792
|
return { name: s.name, score: s.score };
|
|
15683
15793
|
});
|
|
15684
15794
|
}
|
|
15795
|
+
function probeRow(entry, index) {
|
|
15796
|
+
return {
|
|
15797
|
+
index,
|
|
15798
|
+
input: entry.input,
|
|
15799
|
+
output: "",
|
|
15800
|
+
...entry.expected !== void 0 ? { expected: entry.expected } : {},
|
|
15801
|
+
scores: [],
|
|
15802
|
+
meanScore: 0,
|
|
15803
|
+
durationMs: 0,
|
|
15804
|
+
...entry.metadata !== void 0 ? { metadata: entry.metadata } : {}
|
|
15805
|
+
};
|
|
15806
|
+
}
|
|
15807
|
+
function computeDoneKeys(persist) {
|
|
15808
|
+
if (persist.resume !== true) return /* @__PURE__ */ new Set();
|
|
15809
|
+
return readJsonlIds(
|
|
15810
|
+
persist.path,
|
|
15811
|
+
(parsed) => parsed.error === void 0 ? persist.key(parsed) : void 0
|
|
15812
|
+
);
|
|
15813
|
+
}
|
|
15814
|
+
function appendRowSafely(path, row) {
|
|
15815
|
+
try {
|
|
15816
|
+
appendJsonl(path, row);
|
|
15817
|
+
} catch (err) {
|
|
15818
|
+
console.warn(
|
|
15819
|
+
"[eval] persist append failed (ignored):",
|
|
15820
|
+
err instanceof Error ? err.message : err
|
|
15821
|
+
);
|
|
15822
|
+
}
|
|
15823
|
+
}
|
|
15824
|
+
function makeRowSink(persist, classify) {
|
|
15825
|
+
const doneKeys = persist !== void 0 ? computeDoneKeys(persist) : /* @__PURE__ */ new Set();
|
|
15826
|
+
return {
|
|
15827
|
+
isResumed(entry, index) {
|
|
15828
|
+
if (persist === void 0 || doneKeys.size === 0) return false;
|
|
15829
|
+
return doneKeys.has(persist.key(probeRow(entry, index)));
|
|
15830
|
+
},
|
|
15831
|
+
finalize(row) {
|
|
15832
|
+
const outcome = classify?.(row);
|
|
15833
|
+
const finalRow = outcome !== void 0 ? { ...row, outcome } : row;
|
|
15834
|
+
if (persist !== void 0) appendRowSafely(persist.path, finalRow);
|
|
15835
|
+
return finalRow;
|
|
15836
|
+
}
|
|
15837
|
+
};
|
|
15838
|
+
}
|
|
15685
15839
|
async function applyScorer(scorer, output, expected) {
|
|
15686
15840
|
let raw;
|
|
15687
15841
|
try {
|
|
@@ -15727,7 +15881,15 @@ function makeAgentForBatch(spec, _entries) {
|
|
|
15727
15881
|
}
|
|
15728
15882
|
return spec;
|
|
15729
15883
|
}
|
|
15730
|
-
async function
|
|
15884
|
+
async function runManualSlot(idx, entries, spec, scorers, sink, rows, onRow) {
|
|
15885
|
+
const entry = entries[idx];
|
|
15886
|
+
if (entry === void 0) return;
|
|
15887
|
+
if (sink.isResumed(entry, idx)) return;
|
|
15888
|
+
const row = sink.finalize(await runOneEntry(spec, entry, idx, scorers));
|
|
15889
|
+
rows[idx] = row;
|
|
15890
|
+
onRow(row, idx);
|
|
15891
|
+
}
|
|
15892
|
+
async function runRowsManually(entries, spec, scorers, concurrency, signal, onRow, sink) {
|
|
15731
15893
|
const rows = new Array(entries.length);
|
|
15732
15894
|
const state2 = { cursor: 0 };
|
|
15733
15895
|
const worker = async () => {
|
|
@@ -15735,11 +15897,7 @@ async function runRowsManually(entries, spec, scorers, concurrency, signal, onRo
|
|
|
15735
15897
|
if (signal?.aborted === true) return;
|
|
15736
15898
|
const idx = state2.cursor;
|
|
15737
15899
|
state2.cursor += 1;
|
|
15738
|
-
|
|
15739
|
-
if (entry === void 0) continue;
|
|
15740
|
-
const row = await runOneEntry(spec, entry, idx, scorers);
|
|
15741
|
-
rows[idx] = row;
|
|
15742
|
-
onRow(row, idx);
|
|
15900
|
+
await runManualSlot(idx, entries, spec, scorers, sink, rows, onRow);
|
|
15743
15901
|
}
|
|
15744
15902
|
};
|
|
15745
15903
|
const workers = Array.from({ length: Math.min(concurrency, entries.length) }, () => worker());
|
|
@@ -15789,23 +15947,32 @@ async function scoreBatchOutput(br, expected, scorers) {
|
|
|
15789
15947
|
}
|
|
15790
15948
|
return scoreEntries;
|
|
15791
15949
|
}
|
|
15792
|
-
async function runRowsViaBatch(entries, agentOptions, scorers, concurrency, signal, onRow) {
|
|
15793
|
-
const
|
|
15950
|
+
async function runRowsViaBatch(entries, agentOptions, scorers, concurrency, signal, onRow, sink) {
|
|
15951
|
+
const pending = [];
|
|
15952
|
+
for (let i = 0; i < entries.length; i += 1) {
|
|
15953
|
+
const entry = entries[i];
|
|
15954
|
+
if (entry === void 0) continue;
|
|
15955
|
+
if (sink.isResumed(entry, i)) continue;
|
|
15956
|
+
pending.push({ entry, index: i });
|
|
15957
|
+
}
|
|
15794
15958
|
const batchOpts = {
|
|
15795
15959
|
...agentOptions,
|
|
15796
15960
|
concurrency,
|
|
15797
15961
|
...signal !== void 0 ? { signal } : {}
|
|
15798
15962
|
};
|
|
15799
|
-
const batchResults = await getAgentFacade().batch(
|
|
15963
|
+
const batchResults = await getAgentFacade().batch(
|
|
15964
|
+
pending.map((p) => p.entry.input),
|
|
15965
|
+
batchOpts
|
|
15966
|
+
);
|
|
15800
15967
|
const rows = [];
|
|
15801
15968
|
for (let i = 0; i < batchResults.length; i += 1) {
|
|
15802
|
-
const
|
|
15969
|
+
const slot = pending[i];
|
|
15803
15970
|
const br = batchResults[i];
|
|
15804
|
-
if (
|
|
15805
|
-
const scoreEntries = await scoreBatchOutput(br, entry.expected, scorers);
|
|
15806
|
-
const row = rowFromBatchResult(entry, br, scoreEntries,
|
|
15971
|
+
if (slot === void 0 || br === void 0) continue;
|
|
15972
|
+
const scoreEntries = await scoreBatchOutput(br, slot.entry.expected, scorers);
|
|
15973
|
+
const row = sink.finalize(rowFromBatchResult(slot.entry, br, scoreEntries, slot.index));
|
|
15807
15974
|
rows.push(row);
|
|
15808
|
-
onRow(row,
|
|
15975
|
+
onRow(row, slot.index);
|
|
15809
15976
|
}
|
|
15810
15977
|
return rows;
|
|
15811
15978
|
}
|
|
@@ -15830,6 +15997,7 @@ async function runEval(options, runOpts) {
|
|
|
15830
15997
|
const onRow = (row, i) => {
|
|
15831
15998
|
safeHook(() => hooks?.afterRow?.(row, i));
|
|
15832
15999
|
};
|
|
16000
|
+
const sink = makeRowSink(runOpts?.persist, runOpts?.classify);
|
|
15833
16001
|
let rows;
|
|
15834
16002
|
if (isAgentInstance(options.agent) || typeof options.agent === "function") {
|
|
15835
16003
|
rows = await runRowsManually(
|
|
@@ -15838,11 +16006,12 @@ async function runEval(options, runOpts) {
|
|
|
15838
16006
|
scorers,
|
|
15839
16007
|
concurrency,
|
|
15840
16008
|
signal,
|
|
15841
|
-
onRow
|
|
16009
|
+
onRow,
|
|
16010
|
+
sink
|
|
15842
16011
|
);
|
|
15843
16012
|
} else {
|
|
15844
16013
|
const batchOpts = makeAgentForBatch(options.agent, indexed);
|
|
15845
|
-
rows = await runRowsViaBatch(indexed, batchOpts, scorers, concurrency, signal, onRow);
|
|
16014
|
+
rows = await runRowsViaBatch(indexed, batchOpts, scorers, concurrency, signal, onRow, sink);
|
|
15846
16015
|
}
|
|
15847
16016
|
const aggregate = computeAggregate(rows);
|
|
15848
16017
|
const endedAt = Date.now();
|
|
@@ -15868,6 +16037,25 @@ async function runEval(options, runOpts) {
|
|
|
15868
16037
|
}
|
|
15869
16038
|
}
|
|
15870
16039
|
|
|
16040
|
+
// src/sandbox/shell-escape.ts
|
|
16041
|
+
function shellEscapePosix(arg) {
|
|
16042
|
+
return `'${arg.replace(/'/g, "'\\''")}'`;
|
|
16043
|
+
}
|
|
16044
|
+
|
|
16045
|
+
// src/internal/eval/code-runner.ts
|
|
16046
|
+
var ARTIFACT_PATCH = ".theo-artifact.patch";
|
|
16047
|
+
async function captureArtifact(sandbox, repoDir) {
|
|
16048
|
+
const dir = shellEscapePosix(repoDir);
|
|
16049
|
+
const diffRes = await sandbox.execute(`git -C ${dir} diff`);
|
|
16050
|
+
const diff = diffRes.stdout;
|
|
16051
|
+
if (diff.length === 0) return { diff: "", applies: false };
|
|
16052
|
+
await sandbox.uploadFile(`${repoDir}/${ARTIFACT_PATCH}`, diff);
|
|
16053
|
+
const check = await sandbox.execute(
|
|
16054
|
+
`git -C ${dir} apply --check --reverse ${shellEscapePosix(ARTIFACT_PATCH)}`
|
|
16055
|
+
);
|
|
16056
|
+
return { diff, applies: check.exitCode === 0 };
|
|
16057
|
+
}
|
|
16058
|
+
|
|
15871
16059
|
// src/internal/scorers/llm-judge.ts
|
|
15872
16060
|
init_agent_factory_registry();
|
|
15873
16061
|
function buildPrompt(subject, criteria, rubric, expected) {
|
|
@@ -16010,6 +16198,38 @@ var Scorers = {
|
|
|
16010
16198
|
}
|
|
16011
16199
|
};
|
|
16012
16200
|
},
|
|
16201
|
+
/**
|
|
16202
|
+
* Verify-gate scorer (M6-2): runs the project's tests in the provisioned
|
|
16203
|
+
* repo via `SandboxBackend.execute` and scores `1` iff the command exits `0`,
|
|
16204
|
+
* else `0` with the exit code + truncated stderr in `reason`. Grades the
|
|
16205
|
+
* artifact captured by `captureArtifact` (D2 — rides `execute`, never a
|
|
16206
|
+
* direct `child_process`).
|
|
16207
|
+
*
|
|
16208
|
+
* SECURITY: `command` is REQUIRED and the caller's builder owns shell-safety
|
|
16209
|
+
* of the (potentially untrusted, dataset-derived) test identifiers. There is
|
|
16210
|
+
* NO default that runs bare test names — that would interpolate untrusted
|
|
16211
|
+
* `failToPass`/`passToPass` straight into a shell. `repoDir` is shell-escaped
|
|
16212
|
+
* by the SDK; the test list is the builder's responsibility to render safely.
|
|
16213
|
+
*
|
|
16214
|
+
* PORTABILITY: the command is wrapped as `cd <repoDir> && <cmd>`, which
|
|
16215
|
+
* assumes a shell-backed `SandboxBackend` (LocalSandbox/Docker). A backend
|
|
16216
|
+
* that rejects shell metacharacters in `execute` is unsupported for this scorer.
|
|
16217
|
+
*/
|
|
16218
|
+
verifyGate(opts) {
|
|
16219
|
+
const { sandbox, repoDir, failToPass, passToPass, command } = opts;
|
|
16220
|
+
return {
|
|
16221
|
+
name: "verify-gate",
|
|
16222
|
+
score: async () => {
|
|
16223
|
+
const cmd = command([...failToPass, ...passToPass]).trim();
|
|
16224
|
+
if (cmd.length === 0) {
|
|
16225
|
+
return { score: 0, reason: "verify_gate_empty_command" };
|
|
16226
|
+
}
|
|
16227
|
+
const r = await sandbox.execute(`cd ${shellEscapePosix(repoDir)} && ${cmd}`);
|
|
16228
|
+
if (r.exitCode === 0) return { score: 1 };
|
|
16229
|
+
return { score: 0, reason: `exit=${r.exitCode} ${r.stderr.slice(0, 200)}`.trim() };
|
|
16230
|
+
}
|
|
16231
|
+
};
|
|
16232
|
+
},
|
|
16013
16233
|
jsonShape(schema, opts = {}) {
|
|
16014
16234
|
return {
|
|
16015
16235
|
name: "json-shape",
|
|
@@ -16084,6 +16304,9 @@ var Eval = class _Eval {
|
|
|
16084
16304
|
|
|
16085
16305
|
exports.Eval = Eval;
|
|
16086
16306
|
exports.EvalAlreadyRunningError = EvalAlreadyRunningError;
|
|
16307
|
+
exports.JsonlParseError = JsonlParseError;
|
|
16087
16308
|
exports.Scorers = Scorers;
|
|
16309
|
+
exports.captureArtifact = captureArtifact;
|
|
16310
|
+
exports.loadJsonl = loadJsonl;
|
|
16088
16311
|
//# sourceMappingURL=eval.cjs.map
|
|
16089
16312
|
//# sourceMappingURL=eval.cjs.map
|