@theokit/sdk 2.3.0 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +113 -0
- package/dist/a2a/index.cjs +103 -48
- package/dist/a2a/index.cjs.map +1 -1
- package/dist/a2a/index.js +104 -49
- package/dist/a2a/index.js.map +1 -1
- package/dist/compaction.cjs +78 -0
- package/dist/compaction.cjs.map +1 -0
- package/dist/compaction.d.cts +76 -0
- package/dist/compaction.d.ts +76 -0
- package/dist/compaction.js +70 -0
- package/dist/compaction.js.map +1 -0
- package/dist/{cron-B_H8rn-j.d.cts → cron-B656C3iq.d.cts} +8 -0
- package/dist/{cron-DX6HbHxd.d.ts → cron-CM2M9mhB.d.ts} +8 -0
- package/dist/cron.cjs +104 -57
- package/dist/cron.cjs.map +1 -1
- package/dist/cron.d.cts +1 -1
- package/dist/cron.d.ts +1 -1
- package/dist/cron.js +104 -57
- package/dist/cron.js.map +1 -1
- package/dist/eval.cjs +296 -73
- package/dist/eval.cjs.map +1 -1
- package/dist/eval.d.cts +2 -0
- package/dist/eval.d.ts +2 -0
- package/dist/eval.js +295 -75
- package/dist/eval.js.map +1 -1
- package/dist/index.cjs +135 -65
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +42 -7
- package/dist/index.d.ts +42 -7
- package/dist/index.js +135 -66
- package/dist/index.js.map +1 -1
- package/dist/internal/agent-loop/loop.d.ts +5 -0
- package/dist/internal/eval/code-runner.d.ts +28 -0
- package/dist/internal/llm/model-capabilities.d.ts +40 -0
- package/dist/internal/llm/model-identifier.d.ts +9 -1
- package/dist/internal/llm/model-option.d.ts +38 -0
- package/dist/internal/persistence/index.cjs +68 -0
- package/dist/internal/persistence/index.cjs.map +1 -1
- package/dist/internal/persistence/index.d.cts +1 -0
- package/dist/internal/persistence/index.d.ts +1 -0
- package/dist/internal/persistence/index.js +65 -1
- package/dist/internal/persistence/index.js.map +1 -1
- package/dist/internal/persistence/jsonl.d.cts +34 -0
- package/dist/internal/persistence/jsonl.d.ts +34 -0
- package/dist/internal/runtime/compression/compression-attempt.d.ts +24 -0
- package/dist/internal/runtime/compression/compression-config.d.ts +33 -0
- package/dist/internal/runtime/compression/compression-decision.d.ts +10 -0
- package/dist/internal/runtime/compression/compression-helpers.d.ts +18 -0
- package/dist/internal/runtime/compression/compression-model-registry.d.ts +41 -0
- package/dist/internal/runtime/compression/compression-summarizer.d.ts +29 -0
- package/dist/internal/runtime/context/project-instructions.d.ts +66 -0
- package/dist/internal/runtime/context/replay-history.d.ts +43 -0
- package/dist/internal/runtime/hooks/hooks-frontmatter.d.ts +1 -1
- package/dist/internal/runtime/skills/discover-skills.d.ts +68 -0
- package/dist/internal/runtime/skills/skills-block.d.ts +18 -0
- package/dist/internal/runtime/skills/subagent-tool-scope.d.ts +25 -0
- package/dist/messages.cjs +24 -0
- package/dist/messages.cjs.map +1 -0
- package/dist/messages.d.cts +33 -0
- package/dist/messages.d.ts +33 -0
- package/dist/messages.js +20 -0
- package/dist/messages.js.map +1 -0
- package/dist/models.cjs +233 -0
- package/dist/models.cjs.map +1 -0
- package/dist/models.d.cts +16 -0
- package/dist/models.d.ts +16 -0
- package/dist/models.js +228 -0
- package/dist/models.js.map +1 -0
- package/dist/permission-engine.d.ts +12 -4
- package/dist/project.cjs +149 -0
- package/dist/project.cjs.map +1 -0
- package/dist/project.d.cts +14 -0
- package/dist/project.d.ts +14 -0
- package/dist/project.js +146 -0
- package/dist/project.js.map +1 -0
- package/dist/sandbox/index.cjs +71 -1
- package/dist/sandbox/index.cjs.map +1 -1
- package/dist/sandbox/index.d.cts +1 -0
- package/dist/sandbox/index.d.ts +1 -0
- package/dist/sandbox/index.js +70 -2
- package/dist/sandbox/index.js.map +1 -1
- package/dist/sandbox/provision.d.cts +53 -0
- package/dist/sandbox/provision.d.ts +53 -0
- package/dist/sandbox/shell-escape.d.cts +8 -0
- package/dist/sandbox/shell-escape.d.ts +8 -0
- package/dist/scorers.d.ts +19 -1
- package/dist/skills.cjs +282 -0
- package/dist/skills.cjs.map +1 -0
- package/dist/skills.d.cts +19 -0
- package/dist/skills.d.ts +19 -0
- package/dist/skills.js +279 -0
- package/dist/skills.js.map +1 -0
- package/dist/subagents.cjs +24 -0
- package/dist/subagents.cjs.map +1 -0
- package/dist/subagents.d.cts +14 -0
- package/dist/subagents.d.ts +14 -0
- package/dist/subagents.js +21 -0
- package/dist/subagents.js.map +1 -0
- package/dist/types/agent.d.ts +8 -0
- package/dist/types/eval.d.ts +71 -0
- package/package.json +74 -14
package/dist/eval.d.cts
CHANGED
|
@@ -31,6 +31,8 @@ export declare class Eval {
|
|
|
31
31
|
*/
|
|
32
32
|
run(runOpts?: EvalRunOptions): Promise<EvalRun>;
|
|
33
33
|
}
|
|
34
|
+
export { captureArtifact } from "./internal/eval/code-runner.js";
|
|
34
35
|
export { EvalAlreadyRunningError } from "./internal/eval/single-flight.js";
|
|
36
|
+
export { JsonlParseError, loadJsonl } from "./internal/persistence/jsonl.js";
|
|
35
37
|
export { Scorers } from "./scorers.js";
|
|
36
38
|
export type * from "./types/eval.js";
|
package/dist/eval.d.ts
CHANGED
|
@@ -31,6 +31,8 @@ export declare class Eval {
|
|
|
31
31
|
*/
|
|
32
32
|
run(runOpts?: EvalRunOptions): Promise<EvalRun>;
|
|
33
33
|
}
|
|
34
|
+
export { captureArtifact } from "./internal/eval/code-runner.js";
|
|
34
35
|
export { EvalAlreadyRunningError } from "./internal/eval/single-flight.js";
|
|
36
|
+
export { JsonlParseError, loadJsonl } from "./internal/persistence/jsonl.js";
|
|
35
37
|
export { Scorers } from "./scorers.js";
|
|
36
38
|
export type * from "./types/eval.js";
|
package/dist/eval.js
CHANGED
|
@@ -2,7 +2,7 @@ import { randomUUID, randomBytes, createHash } from 'crypto';
|
|
|
2
2
|
import { readFile, unlink, mkdir, open, rename, statfs, stat, rm, readdir, appendFile, access } from 'fs/promises';
|
|
3
3
|
import { join, dirname, resolve, sep, relative, isAbsolute } from 'path';
|
|
4
4
|
import { z, toJSONSchema } from 'zod';
|
|
5
|
-
import { mkdirSync, readdirSync, existsSync, realpathSync, lstatSync, readlinkSync } from 'fs';
|
|
5
|
+
import { readFileSync, mkdirSync, appendFileSync, readdirSync, existsSync, realpathSync, lstatSync, readlinkSync } from 'fs';
|
|
6
6
|
import { AsyncLocalStorage } from 'async_hooks';
|
|
7
7
|
import { createRequire } from 'module';
|
|
8
8
|
import { homedir } from 'os';
|
|
@@ -6496,6 +6496,8 @@ function parseSubagentMarkdown(raw, filename) {
|
|
|
6496
6496
|
if (fields.model !== void 0) {
|
|
6497
6497
|
definition.model = fields.model === "inherit" ? "inherit" : { id: fields.model };
|
|
6498
6498
|
}
|
|
6499
|
+
const tools = fields.tools?.split(/[\s,]+/).map((t) => t.trim()).filter((t) => t.length > 0);
|
|
6500
|
+
if (tools !== void 0 && tools.length > 0) definition.tools = tools;
|
|
6499
6501
|
return { name, definition };
|
|
6500
6502
|
}
|
|
6501
6503
|
function splitFrontmatter2(raw, filename) {
|
|
@@ -6659,21 +6661,24 @@ ${lines.join("\n")}
|
|
|
6659
6661
|
}
|
|
6660
6662
|
};
|
|
6661
6663
|
|
|
6664
|
+
// src/internal/runtime/skills/skills-block.ts
|
|
6665
|
+
function buildSkillsBlock(skills) {
|
|
6666
|
+
if (skills.length === 0) return void 0;
|
|
6667
|
+
const lines = skills.map(
|
|
6668
|
+
(skill) => ` - ${escapeBlockBody(skill.name)}: ${escapeBlockBody(skill.description)}`
|
|
6669
|
+
);
|
|
6670
|
+
return `<skills>
|
|
6671
|
+
${lines.join("\n")}
|
|
6672
|
+
</skills>`;
|
|
6673
|
+
}
|
|
6674
|
+
|
|
6662
6675
|
// src/internal/runtime/system-prompt/sources/skills-provider.ts
|
|
6663
6676
|
var SkillsPromptProvider = class {
|
|
6664
6677
|
id = "skills";
|
|
6665
6678
|
priority = 20;
|
|
6666
6679
|
contribute(ctx) {
|
|
6667
6680
|
if (ctx.skillsAutoInject === false) return Promise.resolve(void 0);
|
|
6668
|
-
|
|
6669
|
-
const lines = ctx.skills.map((skill) => {
|
|
6670
|
-
const name = escapeBlockBody(skill.name);
|
|
6671
|
-
const description = escapeBlockBody(skill.description);
|
|
6672
|
-
return ` - ${name}: ${description}`;
|
|
6673
|
-
});
|
|
6674
|
-
return Promise.resolve(`<skills>
|
|
6675
|
-
${lines.join("\n")}
|
|
6676
|
-
</skills>`);
|
|
6681
|
+
return Promise.resolve(buildSkillsBlock(ctx.skills));
|
|
6677
6682
|
}
|
|
6678
6683
|
};
|
|
6679
6684
|
|
|
@@ -7781,7 +7786,7 @@ async function loadPluginManifestFromMarkdown(pluginsRoot, folderName) {
|
|
|
7781
7786
|
return metadata;
|
|
7782
7787
|
}
|
|
7783
7788
|
|
|
7784
|
-
// src/internal/runtime/skills/skills-manager.ts
|
|
7789
|
+
// src/internal/runtime/skills/discover-skills.ts
|
|
7785
7790
|
init_errors();
|
|
7786
7791
|
|
|
7787
7792
|
// src/internal/runtime/skills/skill-frontmatter.ts
|
|
@@ -7853,6 +7858,61 @@ function hasContent(value) {
|
|
|
7853
7858
|
return value !== void 0 && value.trim().length > 0;
|
|
7854
7859
|
}
|
|
7855
7860
|
|
|
7861
|
+
// src/internal/runtime/skills/discover-skills.ts
|
|
7862
|
+
async function discoverSkills(dir, options) {
|
|
7863
|
+
let entries;
|
|
7864
|
+
try {
|
|
7865
|
+
entries = await readWorkspaceDir(dir, "skills_read_error", "skills directory");
|
|
7866
|
+
} catch {
|
|
7867
|
+
return [];
|
|
7868
|
+
}
|
|
7869
|
+
const skills = [];
|
|
7870
|
+
for (const entry of entries) {
|
|
7871
|
+
if (!entry.isDirectory()) continue;
|
|
7872
|
+
let skillDir;
|
|
7873
|
+
try {
|
|
7874
|
+
skillDir = safePathJoin(dir, entry.name);
|
|
7875
|
+
assertNoSymlinkEscape(skillDir, dir);
|
|
7876
|
+
} catch {
|
|
7877
|
+
continue;
|
|
7878
|
+
}
|
|
7879
|
+
const skillPath = join(skillDir, "SKILL.md");
|
|
7880
|
+
let raw;
|
|
7881
|
+
try {
|
|
7882
|
+
raw = await readFile(skillPath, "utf8");
|
|
7883
|
+
} catch {
|
|
7884
|
+
continue;
|
|
7885
|
+
}
|
|
7886
|
+
const skill = tryParseSkill(raw, entry.name, skillPath, options);
|
|
7887
|
+
if (skill !== void 0) skills.push(skill);
|
|
7888
|
+
}
|
|
7889
|
+
return skills;
|
|
7890
|
+
}
|
|
7891
|
+
function tryParseSkill(raw, fallbackName, source, options) {
|
|
7892
|
+
try {
|
|
7893
|
+
const frontmatter = parseSkillFrontmatter(raw, fallbackName);
|
|
7894
|
+
const skill = {
|
|
7895
|
+
name: frontmatter.name,
|
|
7896
|
+
description: frontmatter.description,
|
|
7897
|
+
source
|
|
7898
|
+
};
|
|
7899
|
+
if (frontmatter.category !== void 0) skill.category = frontmatter.category;
|
|
7900
|
+
if (frontmatter.dependencies !== void 0) skill.dependencies = frontmatter.dependencies;
|
|
7901
|
+
return skill;
|
|
7902
|
+
} catch (cause) {
|
|
7903
|
+
if (cause instanceof ConfigurationError) {
|
|
7904
|
+
options?.onInvalidSkill?.({
|
|
7905
|
+
name: fallbackName,
|
|
7906
|
+
source,
|
|
7907
|
+
code: cause.code ?? "unknown",
|
|
7908
|
+
message: cause.message
|
|
7909
|
+
});
|
|
7910
|
+
return void 0;
|
|
7911
|
+
}
|
|
7912
|
+
throw cause;
|
|
7913
|
+
}
|
|
7914
|
+
}
|
|
7915
|
+
|
|
7856
7916
|
// src/internal/runtime/skills/skills-manager.ts
|
|
7857
7917
|
var SkillsManager = class {
|
|
7858
7918
|
constructor(cwd, _enabled, settingSourcesIncludeProject) {
|
|
@@ -7870,56 +7930,20 @@ var SkillsManager = class {
|
|
|
7870
7930
|
await this.refresh();
|
|
7871
7931
|
}
|
|
7872
7932
|
async refresh() {
|
|
7873
|
-
this.skills = [];
|
|
7874
7933
|
const skillsRoot = join(this.cwd, ".theokit", "skills");
|
|
7875
|
-
|
|
7876
|
-
|
|
7877
|
-
|
|
7878
|
-
|
|
7879
|
-
|
|
7880
|
-
|
|
7881
|
-
assertNoSymlinkEscape(skillDir, skillsRoot);
|
|
7882
|
-
} catch {
|
|
7883
|
-
continue;
|
|
7884
|
-
}
|
|
7885
|
-
const skillPath = join(skillDir, "SKILL.md");
|
|
7886
|
-
let raw;
|
|
7887
|
-
try {
|
|
7888
|
-
raw = await readFile(skillPath, "utf8");
|
|
7889
|
-
} catch {
|
|
7890
|
-
continue;
|
|
7934
|
+
this.skills = await discoverSkills(skillsRoot, {
|
|
7935
|
+
onInvalidSkill: (info) => {
|
|
7936
|
+
process.stderr.write(
|
|
7937
|
+
`[theokit-sdk] skill ${info.name} skipped (${info.code}): ${info.message}
|
|
7938
|
+
`
|
|
7939
|
+
);
|
|
7891
7940
|
}
|
|
7892
|
-
|
|
7893
|
-
if (metadata !== void 0) this.skills.push(metadata);
|
|
7894
|
-
}
|
|
7941
|
+
});
|
|
7895
7942
|
}
|
|
7896
7943
|
list() {
|
|
7897
7944
|
return Promise.resolve(this.skills);
|
|
7898
7945
|
}
|
|
7899
7946
|
};
|
|
7900
|
-
function tryParseSkill(raw, fallbackName, source) {
|
|
7901
|
-
try {
|
|
7902
|
-
const frontmatter = parseSkillFrontmatter(raw, fallbackName);
|
|
7903
|
-
const metadata = {
|
|
7904
|
-
name: frontmatter.name,
|
|
7905
|
-
description: frontmatter.description,
|
|
7906
|
-
source
|
|
7907
|
-
};
|
|
7908
|
-
if (frontmatter.category !== void 0) metadata.category = frontmatter.category;
|
|
7909
|
-
if (frontmatter.dependencies !== void 0) metadata.dependencies = frontmatter.dependencies;
|
|
7910
|
-
return metadata;
|
|
7911
|
-
} catch (cause) {
|
|
7912
|
-
if (cause instanceof ConfigurationError) {
|
|
7913
|
-
const code = cause.code ?? "unknown";
|
|
7914
|
-
process.stderr.write(
|
|
7915
|
-
`[theokit-sdk] skill ${fallbackName} skipped (${code}): ${cause.message}
|
|
7916
|
-
`
|
|
7917
|
-
);
|
|
7918
|
-
return void 0;
|
|
7919
|
-
}
|
|
7920
|
-
throw cause;
|
|
7921
|
-
}
|
|
7922
|
-
}
|
|
7923
7947
|
|
|
7924
7948
|
// src/internal/runtime/local-agent/local-agent-bootstrap.ts
|
|
7925
7949
|
function registerLocalAgent(args) {
|
|
@@ -8332,6 +8356,7 @@ async function initLoopContext(inputs) {
|
|
|
8332
8356
|
finalStatus: "finished",
|
|
8333
8357
|
usage: new UsageAccumulator(),
|
|
8334
8358
|
nudgeAttempts: 0,
|
|
8359
|
+
stopFeedbackAttempts: 0,
|
|
8335
8360
|
...memoryProviderHandle !== void 0 ? { memoryProviderHandle } : {},
|
|
8336
8361
|
...memorySystemPromptAdditions !== void 0 ? { memorySystemPromptAdditions } : {}
|
|
8337
8362
|
};
|
|
@@ -8476,8 +8501,9 @@ function registerLoopError(ctx, cause) {
|
|
|
8476
8501
|
if (ctx.error !== void 0) return;
|
|
8477
8502
|
const rawMessage = cause?.message;
|
|
8478
8503
|
const message = typeof rawMessage === "string" ? rawMessage : cause instanceof Error ? cause.message : String(cause);
|
|
8504
|
+
const metaCode = cause?.metadata?.code;
|
|
8479
8505
|
const rawCode = cause?.code;
|
|
8480
|
-
const code = typeof rawCode === "string" ? rawCode : void 0;
|
|
8506
|
+
const code = typeof metaCode === "string" ? metaCode : typeof rawCode === "string" ? rawCode : void 0;
|
|
8481
8507
|
ctx.error = code !== void 0 ? { message, code, cause } : { message, cause };
|
|
8482
8508
|
}
|
|
8483
8509
|
async function runCollectorLoop(generator, inputs, ctx) {
|
|
@@ -9273,6 +9299,7 @@ function computeUsageCost(inputs, usage) {
|
|
|
9273
9299
|
|
|
9274
9300
|
// src/internal/agent-loop/loop.ts
|
|
9275
9301
|
var MAX_NUDGE_ATTEMPTS = 2;
|
|
9302
|
+
var MAX_STOP_FEEDBACK_ATTEMPTS = 2;
|
|
9276
9303
|
async function runAgentLoop(inputs) {
|
|
9277
9304
|
const sendSpan = inputs.telemetry?.startSpan("agent.send", {
|
|
9278
9305
|
agentId: inputs.agentId,
|
|
@@ -9430,6 +9457,28 @@ function shouldNudgeAndContinue(ctx, llmOutput) {
|
|
|
9430
9457
|
});
|
|
9431
9458
|
return true;
|
|
9432
9459
|
}
|
|
9460
|
+
async function reflectAfterStop(inputs, ctx) {
|
|
9461
|
+
const result = await inputs.hooks.run({
|
|
9462
|
+
event: "stop",
|
|
9463
|
+
agentId: inputs.agentId,
|
|
9464
|
+
runId: inputs.runId
|
|
9465
|
+
});
|
|
9466
|
+
if (result.blocked) return false;
|
|
9467
|
+
if (ctx.stopFeedbackAttempts >= MAX_STOP_FEEDBACK_ATTEMPTS) return false;
|
|
9468
|
+
const feedback = result.decisions.find(
|
|
9469
|
+
(d) => d.decision === "feedback" && (d.feedback ?? "").length > 0
|
|
9470
|
+
)?.feedback;
|
|
9471
|
+
if (feedback === void 0) return false;
|
|
9472
|
+
ctx.stopFeedbackAttempts += 1;
|
|
9473
|
+
ctx.messages.push({ role: "user", content: [{ type: "text", text: feedback }] });
|
|
9474
|
+
return true;
|
|
9475
|
+
}
|
|
9476
|
+
async function finishOrReflect(inputs, ctx, llmOutput) {
|
|
9477
|
+
if (shouldNudgeAndContinue(ctx, llmOutput)) return "continue";
|
|
9478
|
+
if (await reflectAfterStop(inputs, ctx)) return "continue";
|
|
9479
|
+
ctx.finalStatus = "finished";
|
|
9480
|
+
return "done";
|
|
9481
|
+
}
|
|
9433
9482
|
async function runIteration(inputs, ctx) {
|
|
9434
9483
|
const llmOutput = await streamLlmTurn(inputs, ctx);
|
|
9435
9484
|
accumulateUsage(ctx.usage, llmOutput);
|
|
@@ -9463,9 +9512,7 @@ async function continueOrTerminate(inputs, ctx, llmOutput) {
|
|
|
9463
9512
|
await emitAssistantTextStep(inputs, ctx, llmOutput.text);
|
|
9464
9513
|
}
|
|
9465
9514
|
if (llmOutput.stopReason !== "tool_use" || llmOutput.toolCalls.length === 0) {
|
|
9466
|
-
|
|
9467
|
-
ctx.finalStatus = "finished";
|
|
9468
|
-
return "done";
|
|
9515
|
+
return finishOrReflect(inputs, ctx, llmOutput);
|
|
9469
9516
|
}
|
|
9470
9517
|
ctx.messages.push(buildAssistantTurn(llmOutput.text, llmOutput.toolCalls));
|
|
9471
9518
|
const toolResults = await dispatchTools(inputs, ctx.tools, llmOutput.toolCalls, ctx.events);
|
|
@@ -15488,6 +15535,69 @@ setAgentFacade({
|
|
|
15488
15535
|
resume: (agentId, options) => Agent.resume(agentId, options),
|
|
15489
15536
|
batch: (prompts, options) => Agent.batch(prompts, options)
|
|
15490
15537
|
});
|
|
15538
|
+
var JsonlParseError = class extends Error {
|
|
15539
|
+
constructor(message, line) {
|
|
15540
|
+
super(message);
|
|
15541
|
+
this.line = line;
|
|
15542
|
+
this.name = "JsonlParseError";
|
|
15543
|
+
}
|
|
15544
|
+
line;
|
|
15545
|
+
};
|
|
15546
|
+
function isPlainObject(value) {
|
|
15547
|
+
return typeof value === "object" && value !== null && !Array.isArray(value);
|
|
15548
|
+
}
|
|
15549
|
+
function tryParseObjectLine(line) {
|
|
15550
|
+
if (line.length === 0) return void 0;
|
|
15551
|
+
let parsed;
|
|
15552
|
+
try {
|
|
15553
|
+
parsed = JSON.parse(line);
|
|
15554
|
+
} catch {
|
|
15555
|
+
return void 0;
|
|
15556
|
+
}
|
|
15557
|
+
return isPlainObject(parsed) ? parsed : void 0;
|
|
15558
|
+
}
|
|
15559
|
+
function loadJsonl(path, opts = {}) {
|
|
15560
|
+
const text = readFileSync(path, "utf8");
|
|
15561
|
+
const out = [];
|
|
15562
|
+
let lineNumber = 0;
|
|
15563
|
+
for (const rawLine of text.split("\n")) {
|
|
15564
|
+
lineNumber += 1;
|
|
15565
|
+
const line = rawLine.trim();
|
|
15566
|
+
if (line.length === 0) continue;
|
|
15567
|
+
let parsed;
|
|
15568
|
+
try {
|
|
15569
|
+
parsed = JSON.parse(line);
|
|
15570
|
+
} catch {
|
|
15571
|
+
throw new JsonlParseError(`line ${lineNumber}: invalid JSON`, lineNumber);
|
|
15572
|
+
}
|
|
15573
|
+
if (!isPlainObject(parsed)) {
|
|
15574
|
+
throw new JsonlParseError(`line ${lineNumber}: not a JSON object`, lineNumber);
|
|
15575
|
+
}
|
|
15576
|
+
out.push(opts.map ? opts.map(parsed, lineNumber) : parsed);
|
|
15577
|
+
}
|
|
15578
|
+
return out;
|
|
15579
|
+
}
|
|
15580
|
+
function appendJsonl(path, record) {
|
|
15581
|
+
mkdirSync(dirname(path), { recursive: true });
|
|
15582
|
+
appendFileSync(path, `${JSON.stringify(record)}
|
|
15583
|
+
`);
|
|
15584
|
+
}
|
|
15585
|
+
function readJsonlIds(path, keyFn) {
|
|
15586
|
+
const done = /* @__PURE__ */ new Set();
|
|
15587
|
+
let text;
|
|
15588
|
+
try {
|
|
15589
|
+
text = readFileSync(path, "utf8");
|
|
15590
|
+
} catch {
|
|
15591
|
+
return done;
|
|
15592
|
+
}
|
|
15593
|
+
for (const rawLine of text.split("\n")) {
|
|
15594
|
+
const parsed = tryParseObjectLine(rawLine.trim());
|
|
15595
|
+
if (parsed === void 0) continue;
|
|
15596
|
+
const key = keyFn(parsed);
|
|
15597
|
+
if (typeof key === "string" && key.length > 0) done.add(key);
|
|
15598
|
+
}
|
|
15599
|
+
return done;
|
|
15600
|
+
}
|
|
15491
15601
|
|
|
15492
15602
|
// src/internal/eval/runner.ts
|
|
15493
15603
|
init_agent_factory_registry();
|
|
@@ -15679,6 +15789,50 @@ function normalizeScorers(input) {
|
|
|
15679
15789
|
return { name: s.name, score: s.score };
|
|
15680
15790
|
});
|
|
15681
15791
|
}
|
|
15792
|
+
function probeRow(entry, index) {
|
|
15793
|
+
return {
|
|
15794
|
+
index,
|
|
15795
|
+
input: entry.input,
|
|
15796
|
+
output: "",
|
|
15797
|
+
...entry.expected !== void 0 ? { expected: entry.expected } : {},
|
|
15798
|
+
scores: [],
|
|
15799
|
+
meanScore: 0,
|
|
15800
|
+
durationMs: 0,
|
|
15801
|
+
...entry.metadata !== void 0 ? { metadata: entry.metadata } : {}
|
|
15802
|
+
};
|
|
15803
|
+
}
|
|
15804
|
+
function computeDoneKeys(persist) {
|
|
15805
|
+
if (persist.resume !== true) return /* @__PURE__ */ new Set();
|
|
15806
|
+
return readJsonlIds(
|
|
15807
|
+
persist.path,
|
|
15808
|
+
(parsed) => parsed.error === void 0 ? persist.key(parsed) : void 0
|
|
15809
|
+
);
|
|
15810
|
+
}
|
|
15811
|
+
function appendRowSafely(path, row) {
|
|
15812
|
+
try {
|
|
15813
|
+
appendJsonl(path, row);
|
|
15814
|
+
} catch (err) {
|
|
15815
|
+
console.warn(
|
|
15816
|
+
"[eval] persist append failed (ignored):",
|
|
15817
|
+
err instanceof Error ? err.message : err
|
|
15818
|
+
);
|
|
15819
|
+
}
|
|
15820
|
+
}
|
|
15821
|
+
function makeRowSink(persist, classify) {
|
|
15822
|
+
const doneKeys = persist !== void 0 ? computeDoneKeys(persist) : /* @__PURE__ */ new Set();
|
|
15823
|
+
return {
|
|
15824
|
+
isResumed(entry, index) {
|
|
15825
|
+
if (persist === void 0 || doneKeys.size === 0) return false;
|
|
15826
|
+
return doneKeys.has(persist.key(probeRow(entry, index)));
|
|
15827
|
+
},
|
|
15828
|
+
finalize(row) {
|
|
15829
|
+
const outcome = classify?.(row);
|
|
15830
|
+
const finalRow = outcome !== void 0 ? { ...row, outcome } : row;
|
|
15831
|
+
if (persist !== void 0) appendRowSafely(persist.path, finalRow);
|
|
15832
|
+
return finalRow;
|
|
15833
|
+
}
|
|
15834
|
+
};
|
|
15835
|
+
}
|
|
15682
15836
|
async function applyScorer(scorer, output, expected) {
|
|
15683
15837
|
let raw;
|
|
15684
15838
|
try {
|
|
@@ -15724,7 +15878,15 @@ function makeAgentForBatch(spec, _entries) {
|
|
|
15724
15878
|
}
|
|
15725
15879
|
return spec;
|
|
15726
15880
|
}
|
|
15727
|
-
async function runRowsManually(entries, spec, scorers, concurrency, signal, onRow) {
|
|
15881
|
+
async function runManualSlot(idx, entries, spec, scorers, sink, rows, onRow) {
|
|
15882
|
+
const entry = entries[idx];
|
|
15883
|
+
if (entry === void 0) return;
|
|
15884
|
+
if (sink.isResumed(entry, idx)) return;
|
|
15885
|
+
const row = sink.finalize(await runOneEntry(spec, entry, idx, scorers));
|
|
15886
|
+
rows[idx] = row;
|
|
15887
|
+
onRow(row, idx);
|
|
15888
|
+
}
|
|
15889
|
+
async function runRowsManually(entries, spec, scorers, concurrency, signal, onRow, sink) {
|
|
15728
15890
|
const rows = new Array(entries.length);
|
|
15729
15891
|
const state2 = { cursor: 0 };
|
|
15730
15892
|
const worker = async () => {
|
|
@@ -15732,11 +15894,7 @@ async function runRowsManually(entries, spec, scorers, concurrency, signal, onRo
|
|
|
15732
15894
|
if (signal?.aborted === true) return;
|
|
15733
15895
|
const idx = state2.cursor;
|
|
15734
15896
|
state2.cursor += 1;
|
|
15735
|
-
|
|
15736
|
-
if (entry === void 0) continue;
|
|
15737
|
-
const row = await runOneEntry(spec, entry, idx, scorers);
|
|
15738
|
-
rows[idx] = row;
|
|
15739
|
-
onRow(row, idx);
|
|
15897
|
+
await runManualSlot(idx, entries, spec, scorers, sink, rows, onRow);
|
|
15740
15898
|
}
|
|
15741
15899
|
};
|
|
15742
15900
|
const workers = Array.from({ length: Math.min(concurrency, entries.length) }, () => worker());
|
|
@@ -15786,23 +15944,32 @@ async function scoreBatchOutput(br, expected, scorers) {
|
|
|
15786
15944
|
}
|
|
15787
15945
|
return scoreEntries;
|
|
15788
15946
|
}
|
|
15789
|
-
async function runRowsViaBatch(entries, agentOptions, scorers, concurrency, signal, onRow) {
|
|
15790
|
-
const
|
|
15947
|
+
async function runRowsViaBatch(entries, agentOptions, scorers, concurrency, signal, onRow, sink) {
|
|
15948
|
+
const pending = [];
|
|
15949
|
+
for (let i = 0; i < entries.length; i += 1) {
|
|
15950
|
+
const entry = entries[i];
|
|
15951
|
+
if (entry === void 0) continue;
|
|
15952
|
+
if (sink.isResumed(entry, i)) continue;
|
|
15953
|
+
pending.push({ entry, index: i });
|
|
15954
|
+
}
|
|
15791
15955
|
const batchOpts = {
|
|
15792
15956
|
...agentOptions,
|
|
15793
15957
|
concurrency,
|
|
15794
15958
|
...signal !== void 0 ? { signal } : {}
|
|
15795
15959
|
};
|
|
15796
|
-
const batchResults = await getAgentFacade().batch(
|
|
15960
|
+
const batchResults = await getAgentFacade().batch(
|
|
15961
|
+
pending.map((p) => p.entry.input),
|
|
15962
|
+
batchOpts
|
|
15963
|
+
);
|
|
15797
15964
|
const rows = [];
|
|
15798
15965
|
for (let i = 0; i < batchResults.length; i += 1) {
|
|
15799
|
-
const entry = entries[i];
|
|
15966
|
+
const slot = pending[i];
|
|
15800
15967
|
const br = batchResults[i];
|
|
15801
|
-
if (entry === void 0 || br === void 0) continue;
|
|
15802
|
-
const scoreEntries = await scoreBatchOutput(br, entry.expected, scorers);
|
|
15803
|
-
const row = rowFromBatchResult(entry, br, scoreEntries, i);
|
|
15968
|
+
if (slot === void 0 || br === void 0) continue;
|
|
15969
|
+
const scoreEntries = await scoreBatchOutput(br, slot.entry.expected, scorers);
|
|
15970
|
+
const row = sink.finalize(rowFromBatchResult(slot.entry, br, scoreEntries, slot.index));
|
|
15804
15971
|
rows.push(row);
|
|
15805
|
-
onRow(row, i);
|
|
15972
|
+
onRow(row, slot.index);
|
|
15806
15973
|
}
|
|
15807
15974
|
return rows;
|
|
15808
15975
|
}
|
|
@@ -15827,6 +15994,7 @@ async function runEval(options, runOpts) {
|
|
|
15827
15994
|
const onRow = (row, i) => {
|
|
15828
15995
|
safeHook(() => hooks?.afterRow?.(row, i));
|
|
15829
15996
|
};
|
|
15997
|
+
const sink = makeRowSink(runOpts?.persist, runOpts?.classify);
|
|
15830
15998
|
let rows;
|
|
15831
15999
|
if (isAgentInstance(options.agent) || typeof options.agent === "function") {
|
|
15832
16000
|
rows = await runRowsManually(
|
|
@@ -15835,11 +16003,12 @@ async function runEval(options, runOpts) {
|
|
|
15835
16003
|
scorers,
|
|
15836
16004
|
concurrency,
|
|
15837
16005
|
signal,
|
|
15838
|
-
onRow
|
|
16006
|
+
onRow,
|
|
16007
|
+
sink
|
|
15839
16008
|
);
|
|
15840
16009
|
} else {
|
|
15841
16010
|
const batchOpts = makeAgentForBatch(options.agent, indexed);
|
|
15842
|
-
rows = await runRowsViaBatch(indexed, batchOpts, scorers, concurrency, signal, onRow);
|
|
16011
|
+
rows = await runRowsViaBatch(indexed, batchOpts, scorers, concurrency, signal, onRow, sink);
|
|
15843
16012
|
}
|
|
15844
16013
|
const aggregate = computeAggregate(rows);
|
|
15845
16014
|
const endedAt = Date.now();
|
|
@@ -15865,6 +16034,25 @@ async function runEval(options, runOpts) {
|
|
|
15865
16034
|
}
|
|
15866
16035
|
}
|
|
15867
16036
|
|
|
16037
|
+
// src/sandbox/shell-escape.ts
|
|
16038
|
+
function shellEscapePosix(arg) {
|
|
16039
|
+
return `'${arg.replace(/'/g, "'\\''")}'`;
|
|
16040
|
+
}
|
|
16041
|
+
|
|
16042
|
+
// src/internal/eval/code-runner.ts
|
|
16043
|
+
var ARTIFACT_PATCH = ".theo-artifact.patch";
|
|
16044
|
+
async function captureArtifact(sandbox, repoDir) {
|
|
16045
|
+
const dir = shellEscapePosix(repoDir);
|
|
16046
|
+
const diffRes = await sandbox.execute(`git -C ${dir} diff`);
|
|
16047
|
+
const diff = diffRes.stdout;
|
|
16048
|
+
if (diff.length === 0) return { diff: "", applies: false };
|
|
16049
|
+
await sandbox.uploadFile(`${repoDir}/${ARTIFACT_PATCH}`, diff);
|
|
16050
|
+
const check = await sandbox.execute(
|
|
16051
|
+
`git -C ${dir} apply --check --reverse ${shellEscapePosix(ARTIFACT_PATCH)}`
|
|
16052
|
+
);
|
|
16053
|
+
return { diff, applies: check.exitCode === 0 };
|
|
16054
|
+
}
|
|
16055
|
+
|
|
15868
16056
|
// src/internal/scorers/llm-judge.ts
|
|
15869
16057
|
init_agent_factory_registry();
|
|
15870
16058
|
function buildPrompt(subject, criteria, rubric, expected) {
|
|
@@ -16007,6 +16195,38 @@ var Scorers = {
|
|
|
16007
16195
|
}
|
|
16008
16196
|
};
|
|
16009
16197
|
},
|
|
16198
|
+
/**
|
|
16199
|
+
* Verify-gate scorer (M6-2): runs the project's tests in the provisioned
|
|
16200
|
+
* repo via `SandboxBackend.execute` and scores `1` iff the command exits `0`,
|
|
16201
|
+
* else `0` with the exit code + truncated stderr in `reason`. Grades the
|
|
16202
|
+
* artifact captured by `captureArtifact` (D2 — rides `execute`, never a
|
|
16203
|
+
* direct `child_process`).
|
|
16204
|
+
*
|
|
16205
|
+
* SECURITY: `command` is REQUIRED and the caller's builder owns shell-safety
|
|
16206
|
+
* of the (potentially untrusted, dataset-derived) test identifiers. There is
|
|
16207
|
+
* NO default that runs bare test names — that would interpolate untrusted
|
|
16208
|
+
* `failToPass`/`passToPass` straight into a shell. `repoDir` is shell-escaped
|
|
16209
|
+
* by the SDK; the test list is the builder's responsibility to render safely.
|
|
16210
|
+
*
|
|
16211
|
+
* PORTABILITY: the command is wrapped as `cd <repoDir> && <cmd>`, which
|
|
16212
|
+
* assumes a shell-backed `SandboxBackend` (LocalSandbox/Docker). A backend
|
|
16213
|
+
* that rejects shell metacharacters in `execute` is unsupported for this scorer.
|
|
16214
|
+
*/
|
|
16215
|
+
verifyGate(opts) {
|
|
16216
|
+
const { sandbox, repoDir, failToPass, passToPass, command } = opts;
|
|
16217
|
+
return {
|
|
16218
|
+
name: "verify-gate",
|
|
16219
|
+
score: async () => {
|
|
16220
|
+
const cmd = command([...failToPass, ...passToPass]).trim();
|
|
16221
|
+
if (cmd.length === 0) {
|
|
16222
|
+
return { score: 0, reason: "verify_gate_empty_command" };
|
|
16223
|
+
}
|
|
16224
|
+
const r = await sandbox.execute(`cd ${shellEscapePosix(repoDir)} && ${cmd}`);
|
|
16225
|
+
if (r.exitCode === 0) return { score: 1 };
|
|
16226
|
+
return { score: 0, reason: `exit=${r.exitCode} ${r.stderr.slice(0, 200)}`.trim() };
|
|
16227
|
+
}
|
|
16228
|
+
};
|
|
16229
|
+
},
|
|
16010
16230
|
jsonShape(schema, opts = {}) {
|
|
16011
16231
|
return {
|
|
16012
16232
|
name: "json-shape",
|
|
@@ -16079,6 +16299,6 @@ var Eval = class _Eval {
|
|
|
16079
16299
|
}
|
|
16080
16300
|
};
|
|
16081
16301
|
|
|
16082
|
-
export { Eval, EvalAlreadyRunningError, Scorers };
|
|
16302
|
+
export { Eval, EvalAlreadyRunningError, JsonlParseError, Scorers, captureArtifact, loadJsonl };
|
|
16083
16303
|
//# sourceMappingURL=eval.js.map
|
|
16084
16304
|
//# sourceMappingURL=eval.js.map
|