@theokit/sdk 2.3.0 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. package/CHANGELOG.md +113 -0
  2. package/dist/a2a/index.cjs +103 -48
  3. package/dist/a2a/index.cjs.map +1 -1
  4. package/dist/a2a/index.js +104 -49
  5. package/dist/a2a/index.js.map +1 -1
  6. package/dist/compaction.cjs +78 -0
  7. package/dist/compaction.cjs.map +1 -0
  8. package/dist/compaction.d.cts +76 -0
  9. package/dist/compaction.d.ts +76 -0
  10. package/dist/compaction.js +70 -0
  11. package/dist/compaction.js.map +1 -0
  12. package/dist/{cron-B_H8rn-j.d.cts → cron-B656C3iq.d.cts} +8 -0
  13. package/dist/{cron-DX6HbHxd.d.ts → cron-CM2M9mhB.d.ts} +8 -0
  14. package/dist/cron.cjs +104 -57
  15. package/dist/cron.cjs.map +1 -1
  16. package/dist/cron.d.cts +1 -1
  17. package/dist/cron.d.ts +1 -1
  18. package/dist/cron.js +104 -57
  19. package/dist/cron.js.map +1 -1
  20. package/dist/eval.cjs +296 -73
  21. package/dist/eval.cjs.map +1 -1
  22. package/dist/eval.d.cts +2 -0
  23. package/dist/eval.d.ts +2 -0
  24. package/dist/eval.js +295 -75
  25. package/dist/eval.js.map +1 -1
  26. package/dist/index.cjs +135 -65
  27. package/dist/index.cjs.map +1 -1
  28. package/dist/index.d.cts +42 -7
  29. package/dist/index.d.ts +42 -7
  30. package/dist/index.js +135 -66
  31. package/dist/index.js.map +1 -1
  32. package/dist/internal/agent-loop/loop.d.ts +5 -0
  33. package/dist/internal/eval/code-runner.d.ts +28 -0
  34. package/dist/internal/llm/model-capabilities.d.ts +40 -0
  35. package/dist/internal/llm/model-identifier.d.ts +9 -1
  36. package/dist/internal/llm/model-option.d.ts +38 -0
  37. package/dist/internal/persistence/index.cjs +68 -0
  38. package/dist/internal/persistence/index.cjs.map +1 -1
  39. package/dist/internal/persistence/index.d.cts +1 -0
  40. package/dist/internal/persistence/index.d.ts +1 -0
  41. package/dist/internal/persistence/index.js +65 -1
  42. package/dist/internal/persistence/index.js.map +1 -1
  43. package/dist/internal/persistence/jsonl.d.cts +34 -0
  44. package/dist/internal/persistence/jsonl.d.ts +34 -0
  45. package/dist/internal/runtime/compression/compression-attempt.d.ts +24 -0
  46. package/dist/internal/runtime/compression/compression-config.d.ts +33 -0
  47. package/dist/internal/runtime/compression/compression-decision.d.ts +10 -0
  48. package/dist/internal/runtime/compression/compression-helpers.d.ts +18 -0
  49. package/dist/internal/runtime/compression/compression-model-registry.d.ts +41 -0
  50. package/dist/internal/runtime/compression/compression-summarizer.d.ts +29 -0
  51. package/dist/internal/runtime/context/project-instructions.d.ts +66 -0
  52. package/dist/internal/runtime/context/replay-history.d.ts +43 -0
  53. package/dist/internal/runtime/hooks/hooks-frontmatter.d.ts +1 -1
  54. package/dist/internal/runtime/skills/discover-skills.d.ts +68 -0
  55. package/dist/internal/runtime/skills/skills-block.d.ts +18 -0
  56. package/dist/internal/runtime/skills/subagent-tool-scope.d.ts +25 -0
  57. package/dist/messages.cjs +24 -0
  58. package/dist/messages.cjs.map +1 -0
  59. package/dist/messages.d.cts +33 -0
  60. package/dist/messages.d.ts +33 -0
  61. package/dist/messages.js +20 -0
  62. package/dist/messages.js.map +1 -0
  63. package/dist/models.cjs +233 -0
  64. package/dist/models.cjs.map +1 -0
  65. package/dist/models.d.cts +16 -0
  66. package/dist/models.d.ts +16 -0
  67. package/dist/models.js +228 -0
  68. package/dist/models.js.map +1 -0
  69. package/dist/permission-engine.d.ts +12 -4
  70. package/dist/project.cjs +149 -0
  71. package/dist/project.cjs.map +1 -0
  72. package/dist/project.d.cts +14 -0
  73. package/dist/project.d.ts +14 -0
  74. package/dist/project.js +146 -0
  75. package/dist/project.js.map +1 -0
  76. package/dist/sandbox/index.cjs +71 -1
  77. package/dist/sandbox/index.cjs.map +1 -1
  78. package/dist/sandbox/index.d.cts +1 -0
  79. package/dist/sandbox/index.d.ts +1 -0
  80. package/dist/sandbox/index.js +70 -2
  81. package/dist/sandbox/index.js.map +1 -1
  82. package/dist/sandbox/provision.d.cts +53 -0
  83. package/dist/sandbox/provision.d.ts +53 -0
  84. package/dist/sandbox/shell-escape.d.cts +8 -0
  85. package/dist/sandbox/shell-escape.d.ts +8 -0
  86. package/dist/scorers.d.ts +19 -1
  87. package/dist/skills.cjs +282 -0
  88. package/dist/skills.cjs.map +1 -0
  89. package/dist/skills.d.cts +19 -0
  90. package/dist/skills.d.ts +19 -0
  91. package/dist/skills.js +279 -0
  92. package/dist/skills.js.map +1 -0
  93. package/dist/subagents.cjs +24 -0
  94. package/dist/subagents.cjs.map +1 -0
  95. package/dist/subagents.d.cts +14 -0
  96. package/dist/subagents.d.ts +14 -0
  97. package/dist/subagents.js +21 -0
  98. package/dist/subagents.js.map +1 -0
  99. package/dist/types/agent.d.ts +8 -0
  100. package/dist/types/eval.d.ts +71 -0
  101. package/package.json +74 -14
package/dist/eval.d.cts CHANGED
@@ -31,6 +31,8 @@ export declare class Eval {
31
31
  */
32
32
  run(runOpts?: EvalRunOptions): Promise<EvalRun>;
33
33
  }
34
+ export { captureArtifact } from "./internal/eval/code-runner.js";
34
35
  export { EvalAlreadyRunningError } from "./internal/eval/single-flight.js";
36
+ export { JsonlParseError, loadJsonl } from "./internal/persistence/jsonl.js";
35
37
  export { Scorers } from "./scorers.js";
36
38
  export type * from "./types/eval.js";
package/dist/eval.d.ts CHANGED
@@ -31,6 +31,8 @@ export declare class Eval {
31
31
  */
32
32
  run(runOpts?: EvalRunOptions): Promise<EvalRun>;
33
33
  }
34
+ export { captureArtifact } from "./internal/eval/code-runner.js";
34
35
  export { EvalAlreadyRunningError } from "./internal/eval/single-flight.js";
36
+ export { JsonlParseError, loadJsonl } from "./internal/persistence/jsonl.js";
35
37
  export { Scorers } from "./scorers.js";
36
38
  export type * from "./types/eval.js";
package/dist/eval.js CHANGED
@@ -2,7 +2,7 @@ import { randomUUID, randomBytes, createHash } from 'crypto';
2
2
  import { readFile, unlink, mkdir, open, rename, statfs, stat, rm, readdir, appendFile, access } from 'fs/promises';
3
3
  import { join, dirname, resolve, sep, relative, isAbsolute } from 'path';
4
4
  import { z, toJSONSchema } from 'zod';
5
- import { mkdirSync, readdirSync, existsSync, realpathSync, lstatSync, readlinkSync, readFileSync } from 'fs';
5
+ import { readFileSync, mkdirSync, appendFileSync, readdirSync, existsSync, realpathSync, lstatSync, readlinkSync } from 'fs';
6
6
  import { AsyncLocalStorage } from 'async_hooks';
7
7
  import { createRequire } from 'module';
8
8
  import { homedir } from 'os';
@@ -6496,6 +6496,8 @@ function parseSubagentMarkdown(raw, filename) {
6496
6496
  if (fields.model !== void 0) {
6497
6497
  definition.model = fields.model === "inherit" ? "inherit" : { id: fields.model };
6498
6498
  }
6499
+ const tools = fields.tools?.split(/[\s,]+/).map((t) => t.trim()).filter((t) => t.length > 0);
6500
+ if (tools !== void 0 && tools.length > 0) definition.tools = tools;
6499
6501
  return { name, definition };
6500
6502
  }
6501
6503
  function splitFrontmatter2(raw, filename) {
@@ -6659,21 +6661,24 @@ ${lines.join("\n")}
6659
6661
  }
6660
6662
  };
6661
6663
 
6664
+ // src/internal/runtime/skills/skills-block.ts
6665
+ function buildSkillsBlock(skills) {
6666
+ if (skills.length === 0) return void 0;
6667
+ const lines = skills.map(
6668
+ (skill) => ` - ${escapeBlockBody(skill.name)}: ${escapeBlockBody(skill.description)}`
6669
+ );
6670
+ return `<skills>
6671
+ ${lines.join("\n")}
6672
+ </skills>`;
6673
+ }
6674
+
6662
6675
  // src/internal/runtime/system-prompt/sources/skills-provider.ts
6663
6676
  var SkillsPromptProvider = class {
6664
6677
  id = "skills";
6665
6678
  priority = 20;
6666
6679
  contribute(ctx) {
6667
6680
  if (ctx.skillsAutoInject === false) return Promise.resolve(void 0);
6668
- if (ctx.skills.length === 0) return Promise.resolve(void 0);
6669
- const lines = ctx.skills.map((skill) => {
6670
- const name = escapeBlockBody(skill.name);
6671
- const description = escapeBlockBody(skill.description);
6672
- return ` - ${name}: ${description}`;
6673
- });
6674
- return Promise.resolve(`<skills>
6675
- ${lines.join("\n")}
6676
- </skills>`);
6681
+ return Promise.resolve(buildSkillsBlock(ctx.skills));
6677
6682
  }
6678
6683
  };
6679
6684
 
@@ -7781,7 +7786,7 @@ async function loadPluginManifestFromMarkdown(pluginsRoot, folderName) {
7781
7786
  return metadata;
7782
7787
  }
7783
7788
 
7784
- // src/internal/runtime/skills/skills-manager.ts
7789
+ // src/internal/runtime/skills/discover-skills.ts
7785
7790
  init_errors();
7786
7791
 
7787
7792
  // src/internal/runtime/skills/skill-frontmatter.ts
@@ -7853,6 +7858,61 @@ function hasContent(value) {
7853
7858
  return value !== void 0 && value.trim().length > 0;
7854
7859
  }
7855
7860
 
7861
+ // src/internal/runtime/skills/discover-skills.ts
7862
+ async function discoverSkills(dir, options) {
7863
+ let entries;
7864
+ try {
7865
+ entries = await readWorkspaceDir(dir, "skills_read_error", "skills directory");
7866
+ } catch {
7867
+ return [];
7868
+ }
7869
+ const skills = [];
7870
+ for (const entry of entries) {
7871
+ if (!entry.isDirectory()) continue;
7872
+ let skillDir;
7873
+ try {
7874
+ skillDir = safePathJoin(dir, entry.name);
7875
+ assertNoSymlinkEscape(skillDir, dir);
7876
+ } catch {
7877
+ continue;
7878
+ }
7879
+ const skillPath = join(skillDir, "SKILL.md");
7880
+ let raw;
7881
+ try {
7882
+ raw = await readFile(skillPath, "utf8");
7883
+ } catch {
7884
+ continue;
7885
+ }
7886
+ const skill = tryParseSkill(raw, entry.name, skillPath, options);
7887
+ if (skill !== void 0) skills.push(skill);
7888
+ }
7889
+ return skills;
7890
+ }
7891
+ function tryParseSkill(raw, fallbackName, source, options) {
7892
+ try {
7893
+ const frontmatter = parseSkillFrontmatter(raw, fallbackName);
7894
+ const skill = {
7895
+ name: frontmatter.name,
7896
+ description: frontmatter.description,
7897
+ source
7898
+ };
7899
+ if (frontmatter.category !== void 0) skill.category = frontmatter.category;
7900
+ if (frontmatter.dependencies !== void 0) skill.dependencies = frontmatter.dependencies;
7901
+ return skill;
7902
+ } catch (cause) {
7903
+ if (cause instanceof ConfigurationError) {
7904
+ options?.onInvalidSkill?.({
7905
+ name: fallbackName,
7906
+ source,
7907
+ code: cause.code ?? "unknown",
7908
+ message: cause.message
7909
+ });
7910
+ return void 0;
7911
+ }
7912
+ throw cause;
7913
+ }
7914
+ }
7915
+
7856
7916
  // src/internal/runtime/skills/skills-manager.ts
7857
7917
  var SkillsManager = class {
7858
7918
  constructor(cwd, _enabled, settingSourcesIncludeProject) {
@@ -7870,56 +7930,20 @@ var SkillsManager = class {
7870
7930
  await this.refresh();
7871
7931
  }
7872
7932
  async refresh() {
7873
- this.skills = [];
7874
7933
  const skillsRoot = join(this.cwd, ".theokit", "skills");
7875
- const entries = await readWorkspaceDir(skillsRoot, "skills_read_error", "skills directory");
7876
- for (const entry of entries) {
7877
- if (!entry.isDirectory()) continue;
7878
- let skillDir;
7879
- try {
7880
- skillDir = safePathJoin(skillsRoot, entry.name);
7881
- assertNoSymlinkEscape(skillDir, skillsRoot);
7882
- } catch {
7883
- continue;
7884
- }
7885
- const skillPath = join(skillDir, "SKILL.md");
7886
- let raw;
7887
- try {
7888
- raw = await readFile(skillPath, "utf8");
7889
- } catch {
7890
- continue;
7934
+ this.skills = await discoverSkills(skillsRoot, {
7935
+ onInvalidSkill: (info) => {
7936
+ process.stderr.write(
7937
+ `[theokit-sdk] skill ${info.name} skipped (${info.code}): ${info.message}
7938
+ `
7939
+ );
7891
7940
  }
7892
- const metadata = tryParseSkill(raw, entry.name, skillPath);
7893
- if (metadata !== void 0) this.skills.push(metadata);
7894
- }
7941
+ });
7895
7942
  }
7896
7943
  list() {
7897
7944
  return Promise.resolve(this.skills);
7898
7945
  }
7899
7946
  };
7900
- function tryParseSkill(raw, fallbackName, source) {
7901
- try {
7902
- const frontmatter = parseSkillFrontmatter(raw, fallbackName);
7903
- const metadata = {
7904
- name: frontmatter.name,
7905
- description: frontmatter.description,
7906
- source
7907
- };
7908
- if (frontmatter.category !== void 0) metadata.category = frontmatter.category;
7909
- if (frontmatter.dependencies !== void 0) metadata.dependencies = frontmatter.dependencies;
7910
- return metadata;
7911
- } catch (cause) {
7912
- if (cause instanceof ConfigurationError) {
7913
- const code = cause.code ?? "unknown";
7914
- process.stderr.write(
7915
- `[theokit-sdk] skill ${fallbackName} skipped (${code}): ${cause.message}
7916
- `
7917
- );
7918
- return void 0;
7919
- }
7920
- throw cause;
7921
- }
7922
- }
7923
7947
 
7924
7948
  // src/internal/runtime/local-agent/local-agent-bootstrap.ts
7925
7949
  function registerLocalAgent(args) {
@@ -8332,6 +8356,7 @@ async function initLoopContext(inputs) {
8332
8356
  finalStatus: "finished",
8333
8357
  usage: new UsageAccumulator(),
8334
8358
  nudgeAttempts: 0,
8359
+ stopFeedbackAttempts: 0,
8335
8360
  ...memoryProviderHandle !== void 0 ? { memoryProviderHandle } : {},
8336
8361
  ...memorySystemPromptAdditions !== void 0 ? { memorySystemPromptAdditions } : {}
8337
8362
  };
@@ -8476,8 +8501,9 @@ function registerLoopError(ctx, cause) {
8476
8501
  if (ctx.error !== void 0) return;
8477
8502
  const rawMessage = cause?.message;
8478
8503
  const message = typeof rawMessage === "string" ? rawMessage : cause instanceof Error ? cause.message : String(cause);
8504
+ const metaCode = cause?.metadata?.code;
8479
8505
  const rawCode = cause?.code;
8480
- const code = typeof rawCode === "string" ? rawCode : void 0;
8506
+ const code = typeof metaCode === "string" ? metaCode : typeof rawCode === "string" ? rawCode : void 0;
8481
8507
  ctx.error = code !== void 0 ? { message, code, cause } : { message, cause };
8482
8508
  }
8483
8509
  async function runCollectorLoop(generator, inputs, ctx) {
@@ -9273,6 +9299,7 @@ function computeUsageCost(inputs, usage) {
9273
9299
 
9274
9300
  // src/internal/agent-loop/loop.ts
9275
9301
  var MAX_NUDGE_ATTEMPTS = 2;
9302
+ var MAX_STOP_FEEDBACK_ATTEMPTS = 2;
9276
9303
  async function runAgentLoop(inputs) {
9277
9304
  const sendSpan = inputs.telemetry?.startSpan("agent.send", {
9278
9305
  agentId: inputs.agentId,
@@ -9430,6 +9457,28 @@ function shouldNudgeAndContinue(ctx, llmOutput) {
9430
9457
  });
9431
9458
  return true;
9432
9459
  }
9460
+ async function reflectAfterStop(inputs, ctx) {
9461
+ const result = await inputs.hooks.run({
9462
+ event: "stop",
9463
+ agentId: inputs.agentId,
9464
+ runId: inputs.runId
9465
+ });
9466
+ if (result.blocked) return false;
9467
+ if (ctx.stopFeedbackAttempts >= MAX_STOP_FEEDBACK_ATTEMPTS) return false;
9468
+ const feedback = result.decisions.find(
9469
+ (d) => d.decision === "feedback" && (d.feedback ?? "").length > 0
9470
+ )?.feedback;
9471
+ if (feedback === void 0) return false;
9472
+ ctx.stopFeedbackAttempts += 1;
9473
+ ctx.messages.push({ role: "user", content: [{ type: "text", text: feedback }] });
9474
+ return true;
9475
+ }
9476
+ async function finishOrReflect(inputs, ctx, llmOutput) {
9477
+ if (shouldNudgeAndContinue(ctx, llmOutput)) return "continue";
9478
+ if (await reflectAfterStop(inputs, ctx)) return "continue";
9479
+ ctx.finalStatus = "finished";
9480
+ return "done";
9481
+ }
9433
9482
  async function runIteration(inputs, ctx) {
9434
9483
  const llmOutput = await streamLlmTurn(inputs, ctx);
9435
9484
  accumulateUsage(ctx.usage, llmOutput);
@@ -9463,9 +9512,7 @@ async function continueOrTerminate(inputs, ctx, llmOutput) {
9463
9512
  await emitAssistantTextStep(inputs, ctx, llmOutput.text);
9464
9513
  }
9465
9514
  if (llmOutput.stopReason !== "tool_use" || llmOutput.toolCalls.length === 0) {
9466
- if (shouldNudgeAndContinue(ctx, llmOutput)) return "continue";
9467
- ctx.finalStatus = "finished";
9468
- return "done";
9515
+ return finishOrReflect(inputs, ctx, llmOutput);
9469
9516
  }
9470
9517
  ctx.messages.push(buildAssistantTurn(llmOutput.text, llmOutput.toolCalls));
9471
9518
  const toolResults = await dispatchTools(inputs, ctx.tools, llmOutput.toolCalls, ctx.events);
@@ -15488,6 +15535,69 @@ setAgentFacade({
15488
15535
  resume: (agentId, options) => Agent.resume(agentId, options),
15489
15536
  batch: (prompts, options) => Agent.batch(prompts, options)
15490
15537
  });
15538
+ var JsonlParseError = class extends Error {
15539
+ constructor(message, line) {
15540
+ super(message);
15541
+ this.line = line;
15542
+ this.name = "JsonlParseError";
15543
+ }
15544
+ line;
15545
+ };
15546
+ function isPlainObject(value) {
15547
+ return typeof value === "object" && value !== null && !Array.isArray(value);
15548
+ }
15549
+ function tryParseObjectLine(line) {
15550
+ if (line.length === 0) return void 0;
15551
+ let parsed;
15552
+ try {
15553
+ parsed = JSON.parse(line);
15554
+ } catch {
15555
+ return void 0;
15556
+ }
15557
+ return isPlainObject(parsed) ? parsed : void 0;
15558
+ }
15559
+ function loadJsonl(path, opts = {}) {
15560
+ const text = readFileSync(path, "utf8");
15561
+ const out = [];
15562
+ let lineNumber = 0;
15563
+ for (const rawLine of text.split("\n")) {
15564
+ lineNumber += 1;
15565
+ const line = rawLine.trim();
15566
+ if (line.length === 0) continue;
15567
+ let parsed;
15568
+ try {
15569
+ parsed = JSON.parse(line);
15570
+ } catch {
15571
+ throw new JsonlParseError(`line ${lineNumber}: invalid JSON`, lineNumber);
15572
+ }
15573
+ if (!isPlainObject(parsed)) {
15574
+ throw new JsonlParseError(`line ${lineNumber}: not a JSON object`, lineNumber);
15575
+ }
15576
+ out.push(opts.map ? opts.map(parsed, lineNumber) : parsed);
15577
+ }
15578
+ return out;
15579
+ }
15580
+ function appendJsonl(path, record) {
15581
+ mkdirSync(dirname(path), { recursive: true });
15582
+ appendFileSync(path, `${JSON.stringify(record)}
15583
+ `);
15584
+ }
15585
+ function readJsonlIds(path, keyFn) {
15586
+ const done = /* @__PURE__ */ new Set();
15587
+ let text;
15588
+ try {
15589
+ text = readFileSync(path, "utf8");
15590
+ } catch {
15591
+ return done;
15592
+ }
15593
+ for (const rawLine of text.split("\n")) {
15594
+ const parsed = tryParseObjectLine(rawLine.trim());
15595
+ if (parsed === void 0) continue;
15596
+ const key = keyFn(parsed);
15597
+ if (typeof key === "string" && key.length > 0) done.add(key);
15598
+ }
15599
+ return done;
15600
+ }
15491
15601
 
15492
15602
  // src/internal/eval/runner.ts
15493
15603
  init_agent_factory_registry();
@@ -15679,6 +15789,50 @@ function normalizeScorers(input) {
15679
15789
  return { name: s.name, score: s.score };
15680
15790
  });
15681
15791
  }
15792
+ function probeRow(entry, index) {
15793
+ return {
15794
+ index,
15795
+ input: entry.input,
15796
+ output: "",
15797
+ ...entry.expected !== void 0 ? { expected: entry.expected } : {},
15798
+ scores: [],
15799
+ meanScore: 0,
15800
+ durationMs: 0,
15801
+ ...entry.metadata !== void 0 ? { metadata: entry.metadata } : {}
15802
+ };
15803
+ }
15804
+ function computeDoneKeys(persist) {
15805
+ if (persist.resume !== true) return /* @__PURE__ */ new Set();
15806
+ return readJsonlIds(
15807
+ persist.path,
15808
+ (parsed) => parsed.error === void 0 ? persist.key(parsed) : void 0
15809
+ );
15810
+ }
15811
+ function appendRowSafely(path, row) {
15812
+ try {
15813
+ appendJsonl(path, row);
15814
+ } catch (err) {
15815
+ console.warn(
15816
+ "[eval] persist append failed (ignored):",
15817
+ err instanceof Error ? err.message : err
15818
+ );
15819
+ }
15820
+ }
15821
+ function makeRowSink(persist, classify) {
15822
+ const doneKeys = persist !== void 0 ? computeDoneKeys(persist) : /* @__PURE__ */ new Set();
15823
+ return {
15824
+ isResumed(entry, index) {
15825
+ if (persist === void 0 || doneKeys.size === 0) return false;
15826
+ return doneKeys.has(persist.key(probeRow(entry, index)));
15827
+ },
15828
+ finalize(row) {
15829
+ const outcome = classify?.(row);
15830
+ const finalRow = outcome !== void 0 ? { ...row, outcome } : row;
15831
+ if (persist !== void 0) appendRowSafely(persist.path, finalRow);
15832
+ return finalRow;
15833
+ }
15834
+ };
15835
+ }
15682
15836
  async function applyScorer(scorer, output, expected) {
15683
15837
  let raw;
15684
15838
  try {
@@ -15724,7 +15878,15 @@ function makeAgentForBatch(spec, _entries) {
15724
15878
  }
15725
15879
  return spec;
15726
15880
  }
15727
- async function runRowsManually(entries, spec, scorers, concurrency, signal, onRow) {
15881
+ async function runManualSlot(idx, entries, spec, scorers, sink, rows, onRow) {
15882
+ const entry = entries[idx];
15883
+ if (entry === void 0) return;
15884
+ if (sink.isResumed(entry, idx)) return;
15885
+ const row = sink.finalize(await runOneEntry(spec, entry, idx, scorers));
15886
+ rows[idx] = row;
15887
+ onRow(row, idx);
15888
+ }
15889
+ async function runRowsManually(entries, spec, scorers, concurrency, signal, onRow, sink) {
15728
15890
  const rows = new Array(entries.length);
15729
15891
  const state2 = { cursor: 0 };
15730
15892
  const worker = async () => {
@@ -15732,11 +15894,7 @@ async function runRowsManually(entries, spec, scorers, concurrency, signal, onRo
15732
15894
  if (signal?.aborted === true) return;
15733
15895
  const idx = state2.cursor;
15734
15896
  state2.cursor += 1;
15735
- const entry = entries[idx];
15736
- if (entry === void 0) continue;
15737
- const row = await runOneEntry(spec, entry, idx, scorers);
15738
- rows[idx] = row;
15739
- onRow(row, idx);
15897
+ await runManualSlot(idx, entries, spec, scorers, sink, rows, onRow);
15740
15898
  }
15741
15899
  };
15742
15900
  const workers = Array.from({ length: Math.min(concurrency, entries.length) }, () => worker());
@@ -15786,23 +15944,32 @@ async function scoreBatchOutput(br, expected, scorers) {
15786
15944
  }
15787
15945
  return scoreEntries;
15788
15946
  }
15789
- async function runRowsViaBatch(entries, agentOptions, scorers, concurrency, signal, onRow) {
15790
- const prompts = entries.map((e) => e.input);
15947
+ async function runRowsViaBatch(entries, agentOptions, scorers, concurrency, signal, onRow, sink) {
15948
+ const pending = [];
15949
+ for (let i = 0; i < entries.length; i += 1) {
15950
+ const entry = entries[i];
15951
+ if (entry === void 0) continue;
15952
+ if (sink.isResumed(entry, i)) continue;
15953
+ pending.push({ entry, index: i });
15954
+ }
15791
15955
  const batchOpts = {
15792
15956
  ...agentOptions,
15793
15957
  concurrency,
15794
15958
  ...signal !== void 0 ? { signal } : {}
15795
15959
  };
15796
- const batchResults = await getAgentFacade().batch(prompts, batchOpts);
15960
+ const batchResults = await getAgentFacade().batch(
15961
+ pending.map((p) => p.entry.input),
15962
+ batchOpts
15963
+ );
15797
15964
  const rows = [];
15798
15965
  for (let i = 0; i < batchResults.length; i += 1) {
15799
- const entry = entries[i];
15966
+ const slot = pending[i];
15800
15967
  const br = batchResults[i];
15801
- if (entry === void 0 || br === void 0) continue;
15802
- const scoreEntries = await scoreBatchOutput(br, entry.expected, scorers);
15803
- const row = rowFromBatchResult(entry, br, scoreEntries, i);
15968
+ if (slot === void 0 || br === void 0) continue;
15969
+ const scoreEntries = await scoreBatchOutput(br, slot.entry.expected, scorers);
15970
+ const row = sink.finalize(rowFromBatchResult(slot.entry, br, scoreEntries, slot.index));
15804
15971
  rows.push(row);
15805
- onRow(row, i);
15972
+ onRow(row, slot.index);
15806
15973
  }
15807
15974
  return rows;
15808
15975
  }
@@ -15827,6 +15994,7 @@ async function runEval(options, runOpts) {
15827
15994
  const onRow = (row, i) => {
15828
15995
  safeHook(() => hooks?.afterRow?.(row, i));
15829
15996
  };
15997
+ const sink = makeRowSink(runOpts?.persist, runOpts?.classify);
15830
15998
  let rows;
15831
15999
  if (isAgentInstance(options.agent) || typeof options.agent === "function") {
15832
16000
  rows = await runRowsManually(
@@ -15835,11 +16003,12 @@ async function runEval(options, runOpts) {
15835
16003
  scorers,
15836
16004
  concurrency,
15837
16005
  signal,
15838
- onRow
16006
+ onRow,
16007
+ sink
15839
16008
  );
15840
16009
  } else {
15841
16010
  const batchOpts = makeAgentForBatch(options.agent, indexed);
15842
- rows = await runRowsViaBatch(indexed, batchOpts, scorers, concurrency, signal, onRow);
16011
+ rows = await runRowsViaBatch(indexed, batchOpts, scorers, concurrency, signal, onRow, sink);
15843
16012
  }
15844
16013
  const aggregate = computeAggregate(rows);
15845
16014
  const endedAt = Date.now();
@@ -15865,6 +16034,25 @@ async function runEval(options, runOpts) {
15865
16034
  }
15866
16035
  }
15867
16036
 
16037
+ // src/sandbox/shell-escape.ts
16038
+ function shellEscapePosix(arg) {
16039
+ return `'${arg.replace(/'/g, "'\\''")}'`;
16040
+ }
16041
+
16042
+ // src/internal/eval/code-runner.ts
16043
+ var ARTIFACT_PATCH = ".theo-artifact.patch";
16044
+ async function captureArtifact(sandbox, repoDir) {
16045
+ const dir = shellEscapePosix(repoDir);
16046
+ const diffRes = await sandbox.execute(`git -C ${dir} diff`);
16047
+ const diff = diffRes.stdout;
16048
+ if (diff.length === 0) return { diff: "", applies: false };
16049
+ await sandbox.uploadFile(`${repoDir}/${ARTIFACT_PATCH}`, diff);
16050
+ const check = await sandbox.execute(
16051
+ `git -C ${dir} apply --check --reverse ${shellEscapePosix(ARTIFACT_PATCH)}`
16052
+ );
16053
+ return { diff, applies: check.exitCode === 0 };
16054
+ }
16055
+
15868
16056
  // src/internal/scorers/llm-judge.ts
15869
16057
  init_agent_factory_registry();
15870
16058
  function buildPrompt(subject, criteria, rubric, expected) {
@@ -16007,6 +16195,38 @@ var Scorers = {
16007
16195
  }
16008
16196
  };
16009
16197
  },
16198
+ /**
16199
+ * Verify-gate scorer (M6-2): runs the project's tests in the provisioned
16200
+ * repo via `SandboxBackend.execute` and scores `1` iff the command exits `0`,
16201
+ * else `0` with the exit code + truncated stderr in `reason`. Grades the
16202
+ * artifact captured by `captureArtifact` (D2 — rides `execute`, never a
16203
+ * direct `child_process`).
16204
+ *
16205
+ * SECURITY: `command` is REQUIRED and the caller's builder owns shell-safety
16206
+ * of the (potentially untrusted, dataset-derived) test identifiers. There is
16207
+ * NO default that runs bare test names — that would interpolate untrusted
16208
+ * `failToPass`/`passToPass` straight into a shell. `repoDir` is shell-escaped
16209
+ * by the SDK; the test list is the builder's responsibility to render safely.
16210
+ *
16211
+ * PORTABILITY: the command is wrapped as `cd <repoDir> && <cmd>`, which
16212
+ * assumes a shell-backed `SandboxBackend` (LocalSandbox/Docker). A backend
16213
+ * that rejects shell metacharacters in `execute` is unsupported for this scorer.
16214
+ */
16215
+ verifyGate(opts) {
16216
+ const { sandbox, repoDir, failToPass, passToPass, command } = opts;
16217
+ return {
16218
+ name: "verify-gate",
16219
+ score: async () => {
16220
+ const cmd = command([...failToPass, ...passToPass]).trim();
16221
+ if (cmd.length === 0) {
16222
+ return { score: 0, reason: "verify_gate_empty_command" };
16223
+ }
16224
+ const r = await sandbox.execute(`cd ${shellEscapePosix(repoDir)} && ${cmd}`);
16225
+ if (r.exitCode === 0) return { score: 1 };
16226
+ return { score: 0, reason: `exit=${r.exitCode} ${r.stderr.slice(0, 200)}`.trim() };
16227
+ }
16228
+ };
16229
+ },
16010
16230
  jsonShape(schema, opts = {}) {
16011
16231
  return {
16012
16232
  name: "json-shape",
@@ -16079,6 +16299,6 @@ var Eval = class _Eval {
16079
16299
  }
16080
16300
  };
16081
16301
 
16082
- export { Eval, EvalAlreadyRunningError, Scorers };
16302
+ export { Eval, EvalAlreadyRunningError, JsonlParseError, Scorers, captureArtifact, loadJsonl };
16083
16303
  //# sourceMappingURL=eval.js.map
16084
16304
  //# sourceMappingURL=eval.js.map