@theokit/sdk 2.3.0 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. package/CHANGELOG.md +113 -0
  2. package/dist/a2a/index.cjs +103 -48
  3. package/dist/a2a/index.cjs.map +1 -1
  4. package/dist/a2a/index.js +104 -49
  5. package/dist/a2a/index.js.map +1 -1
  6. package/dist/compaction.cjs +78 -0
  7. package/dist/compaction.cjs.map +1 -0
  8. package/dist/compaction.d.cts +76 -0
  9. package/dist/compaction.d.ts +76 -0
  10. package/dist/compaction.js +70 -0
  11. package/dist/compaction.js.map +1 -0
  12. package/dist/{cron-B_H8rn-j.d.cts → cron-B656C3iq.d.cts} +8 -0
  13. package/dist/{cron-DX6HbHxd.d.ts → cron-CM2M9mhB.d.ts} +8 -0
  14. package/dist/cron.cjs +104 -57
  15. package/dist/cron.cjs.map +1 -1
  16. package/dist/cron.d.cts +1 -1
  17. package/dist/cron.d.ts +1 -1
  18. package/dist/cron.js +104 -57
  19. package/dist/cron.js.map +1 -1
  20. package/dist/eval.cjs +296 -73
  21. package/dist/eval.cjs.map +1 -1
  22. package/dist/eval.d.cts +2 -0
  23. package/dist/eval.d.ts +2 -0
  24. package/dist/eval.js +295 -75
  25. package/dist/eval.js.map +1 -1
  26. package/dist/index.cjs +135 -65
  27. package/dist/index.cjs.map +1 -1
  28. package/dist/index.d.cts +42 -7
  29. package/dist/index.d.ts +42 -7
  30. package/dist/index.js +135 -66
  31. package/dist/index.js.map +1 -1
  32. package/dist/internal/agent-loop/loop.d.ts +5 -0
  33. package/dist/internal/eval/code-runner.d.ts +28 -0
  34. package/dist/internal/llm/model-capabilities.d.ts +40 -0
  35. package/dist/internal/llm/model-identifier.d.ts +9 -1
  36. package/dist/internal/llm/model-option.d.ts +38 -0
  37. package/dist/internal/persistence/index.cjs +68 -0
  38. package/dist/internal/persistence/index.cjs.map +1 -1
  39. package/dist/internal/persistence/index.d.cts +1 -0
  40. package/dist/internal/persistence/index.d.ts +1 -0
  41. package/dist/internal/persistence/index.js +65 -1
  42. package/dist/internal/persistence/index.js.map +1 -1
  43. package/dist/internal/persistence/jsonl.d.cts +34 -0
  44. package/dist/internal/persistence/jsonl.d.ts +34 -0
  45. package/dist/internal/runtime/compression/compression-attempt.d.ts +24 -0
  46. package/dist/internal/runtime/compression/compression-config.d.ts +33 -0
  47. package/dist/internal/runtime/compression/compression-decision.d.ts +10 -0
  48. package/dist/internal/runtime/compression/compression-helpers.d.ts +18 -0
  49. package/dist/internal/runtime/compression/compression-model-registry.d.ts +41 -0
  50. package/dist/internal/runtime/compression/compression-summarizer.d.ts +29 -0
  51. package/dist/internal/runtime/context/project-instructions.d.ts +66 -0
  52. package/dist/internal/runtime/context/replay-history.d.ts +43 -0
  53. package/dist/internal/runtime/hooks/hooks-frontmatter.d.ts +1 -1
  54. package/dist/internal/runtime/skills/discover-skills.d.ts +68 -0
  55. package/dist/internal/runtime/skills/skills-block.d.ts +18 -0
  56. package/dist/internal/runtime/skills/subagent-tool-scope.d.ts +25 -0
  57. package/dist/messages.cjs +24 -0
  58. package/dist/messages.cjs.map +1 -0
  59. package/dist/messages.d.cts +33 -0
  60. package/dist/messages.d.ts +33 -0
  61. package/dist/messages.js +20 -0
  62. package/dist/messages.js.map +1 -0
  63. package/dist/models.cjs +233 -0
  64. package/dist/models.cjs.map +1 -0
  65. package/dist/models.d.cts +16 -0
  66. package/dist/models.d.ts +16 -0
  67. package/dist/models.js +228 -0
  68. package/dist/models.js.map +1 -0
  69. package/dist/permission-engine.d.ts +12 -4
  70. package/dist/project.cjs +149 -0
  71. package/dist/project.cjs.map +1 -0
  72. package/dist/project.d.cts +14 -0
  73. package/dist/project.d.ts +14 -0
  74. package/dist/project.js +146 -0
  75. package/dist/project.js.map +1 -0
  76. package/dist/sandbox/index.cjs +71 -1
  77. package/dist/sandbox/index.cjs.map +1 -1
  78. package/dist/sandbox/index.d.cts +1 -0
  79. package/dist/sandbox/index.d.ts +1 -0
  80. package/dist/sandbox/index.js +70 -2
  81. package/dist/sandbox/index.js.map +1 -1
  82. package/dist/sandbox/provision.d.cts +53 -0
  83. package/dist/sandbox/provision.d.ts +53 -0
  84. package/dist/sandbox/shell-escape.d.cts +8 -0
  85. package/dist/sandbox/shell-escape.d.ts +8 -0
  86. package/dist/scorers.d.ts +19 -1
  87. package/dist/skills.cjs +282 -0
  88. package/dist/skills.cjs.map +1 -0
  89. package/dist/skills.d.cts +19 -0
  90. package/dist/skills.d.ts +19 -0
  91. package/dist/skills.js +279 -0
  92. package/dist/skills.js.map +1 -0
  93. package/dist/subagents.cjs +24 -0
  94. package/dist/subagents.cjs.map +1 -0
  95. package/dist/subagents.d.cts +14 -0
  96. package/dist/subagents.d.ts +14 -0
  97. package/dist/subagents.js +21 -0
  98. package/dist/subagents.js.map +1 -0
  99. package/dist/types/agent.d.ts +8 -0
  100. package/dist/types/eval.d.ts +71 -0
  101. package/package.json +74 -14
package/dist/eval.cjs CHANGED
@@ -6499,6 +6499,8 @@ function parseSubagentMarkdown(raw, filename) {
6499
6499
  if (fields.model !== void 0) {
6500
6500
  definition.model = fields.model === "inherit" ? "inherit" : { id: fields.model };
6501
6501
  }
6502
+ const tools = fields.tools?.split(/[\s,]+/).map((t) => t.trim()).filter((t) => t.length > 0);
6503
+ if (tools !== void 0 && tools.length > 0) definition.tools = tools;
6502
6504
  return { name, definition };
6503
6505
  }
6504
6506
  function splitFrontmatter2(raw, filename) {
@@ -6662,21 +6664,24 @@ ${lines.join("\n")}
6662
6664
  }
6663
6665
  };
6664
6666
 
6667
+ // src/internal/runtime/skills/skills-block.ts
6668
/**
 * Render the `<skills>` prompt block: one ` - name: description` bullet per
 * skill, wrapped in `<skills>` / `</skills>` tags.
 *
 * Both the name and the description are passed through `escapeBlockBody`
 * before being interpolated.
 *
 * @param {Array<{name: string, description: string}>} skills - Skills to list.
 * @returns {string | undefined} The rendered block, or `undefined` when
 *   `skills` is empty.
 */
function buildSkillsBlock(skills) {
  if (skills.length === 0) {
    return void 0;
  }
  const bullets = skills.map(
    ({ name, description }) => ` - ${escapeBlockBody(name)}: ${escapeBlockBody(description)}`
  );
  return ["<skills>", bullets.join("\n"), "</skills>"].join("\n");
}
6677
+
6665
6678
  // src/internal/runtime/system-prompt/sources/skills-provider.ts
6666
6679
  var SkillsPromptProvider = class {
6667
6680
  id = "skills";
6668
6681
  priority = 20;
6669
6682
  contribute(ctx) {
6670
6683
  if (ctx.skillsAutoInject === false) return Promise.resolve(void 0);
6671
- if (ctx.skills.length === 0) return Promise.resolve(void 0);
6672
- const lines = ctx.skills.map((skill) => {
6673
- const name = escapeBlockBody(skill.name);
6674
- const description = escapeBlockBody(skill.description);
6675
- return ` - ${name}: ${description}`;
6676
- });
6677
- return Promise.resolve(`<skills>
6678
- ${lines.join("\n")}
6679
- </skills>`);
6684
+ return Promise.resolve(buildSkillsBlock(ctx.skills));
6680
6685
  }
6681
6686
  };
6682
6687
 
@@ -7784,7 +7789,7 @@ async function loadPluginManifestFromMarkdown(pluginsRoot, folderName) {
7784
7789
  return metadata;
7785
7790
  }
7786
7791
 
7787
- // src/internal/runtime/skills/skills-manager.ts
7792
+ // src/internal/runtime/skills/discover-skills.ts
7788
7793
  init_errors();
7789
7794
 
7790
7795
  // src/internal/runtime/skills/skill-frontmatter.ts
@@ -7856,6 +7861,61 @@ function hasContent(value) {
7856
7861
  return value !== void 0 && value.trim().length > 0;
7857
7862
  }
7858
7863
 
7864
+ // src/internal/runtime/skills/discover-skills.ts
7865
/**
 * Collect the skills stored under `dir`, one per sub-directory, each described
 * by a `SKILL.md` file inside that sub-directory.
 *
 * Best-effort by design: if the directory listing fails the result is `[]`,
 * and any folder that fails the path checks, has no readable `SKILL.md`, or
 * does not yield a skill from `tryParseSkill` is skipped.
 *
 * @param {string} dir - Directory whose immediate sub-directories are scanned.
 * @param {object} [options] - Passed through unchanged to `tryParseSkill`.
 * @returns {Promise<Array<object>>} Parsed skills, in listing order.
 */
async function discoverSkills(dir, options) {
  let listing;
  try {
    listing = await readWorkspaceDir(dir, "skills_read_error", "skills directory");
  } catch {
    // An unreadable skills root means "no skills", not a failure.
    return [];
  }

  const discovered = [];
  for (const dirent of listing) {
    if (!dirent.isDirectory()) continue;

    let folder;
    try {
      // Either helper throwing causes the folder to be ignored
      // (presumably a path-traversal / symlink guard — confirm in helpers).
      folder = safePathJoin(dir, dirent.name);
      assertNoSymlinkEscape(folder, dir);
    } catch {
      continue;
    }

    const manifestPath = path.join(folder, "SKILL.md");
    let manifest;
    try {
      manifest = await promises.readFile(manifestPath, "utf8");
    } catch {
      // No readable SKILL.md: not a skill folder.
      continue;
    }

    const parsed = tryParseSkill(manifest, dirent.name, manifestPath, options);
    if (parsed === void 0) continue;
    discovered.push(parsed);
  }
  return discovered;
}
7894
/**
 * Parse one `SKILL.md` into a skill record without letting a malformed skill
 * abort discovery.
 *
 * @param {string} raw - File contents handed to `parseSkillFrontmatter`.
 * @param {string} fallbackName - Passed to the parser and used as the `name`
 *   in the invalid-skill report.
 * @param {string} source - Path recorded on the skill as `source`.
 * @param {{onInvalidSkill?: (info: {name: string, source: string, code: string, message: string}) => void}} [options]
 * @returns {object | undefined} `{ name, description, source }` plus
 *   `category` / `dependencies` when the frontmatter defines them;
 *   `undefined` when parsing raised a `ConfigurationError`.
 * @throws Any error that is not a `ConfigurationError` is rethrown unchanged.
 */
function tryParseSkill(raw, fallbackName, source, options) {
  try {
    const fm = parseSkillFrontmatter(raw, fallbackName);
    return {
      name: fm.name,
      description: fm.description,
      source,
      ...(fm.category !== void 0 ? { category: fm.category } : {}),
      ...(fm.dependencies !== void 0 ? { dependencies: fm.dependencies } : {})
    };
  } catch (cause) {
    if (!(cause instanceof ConfigurationError)) {
      throw cause;
    }
    // Invalid skills are reported to the optional callback and skipped.
    options?.onInvalidSkill?.({
      name: fallbackName,
      source,
      code: cause.code ?? "unknown",
      message: cause.message
    });
    return void 0;
  }
}
7918
+
7859
7919
  // src/internal/runtime/skills/skills-manager.ts
7860
7920
  var SkillsManager = class {
7861
7921
  constructor(cwd, _enabled, settingSourcesIncludeProject) {
@@ -7873,56 +7933,20 @@ var SkillsManager = class {
7873
7933
  await this.refresh();
7874
7934
  }
7875
7935
  async refresh() {
7876
- this.skills = [];
7877
7936
  const skillsRoot = path.join(this.cwd, ".theokit", "skills");
7878
- const entries = await readWorkspaceDir(skillsRoot, "skills_read_error", "skills directory");
7879
- for (const entry of entries) {
7880
- if (!entry.isDirectory()) continue;
7881
- let skillDir;
7882
- try {
7883
- skillDir = safePathJoin(skillsRoot, entry.name);
7884
- assertNoSymlinkEscape(skillDir, skillsRoot);
7885
- } catch {
7886
- continue;
7887
- }
7888
- const skillPath = path.join(skillDir, "SKILL.md");
7889
- let raw;
7890
- try {
7891
- raw = await promises.readFile(skillPath, "utf8");
7892
- } catch {
7893
- continue;
7937
+ this.skills = await discoverSkills(skillsRoot, {
7938
+ onInvalidSkill: (info) => {
7939
+ process.stderr.write(
7940
+ `[theokit-sdk] skill ${info.name} skipped (${info.code}): ${info.message}
7941
+ `
7942
+ );
7894
7943
  }
7895
- const metadata = tryParseSkill(raw, entry.name, skillPath);
7896
- if (metadata !== void 0) this.skills.push(metadata);
7897
- }
7944
+ });
7898
7945
  }
7899
7946
  list() {
7900
7947
  return Promise.resolve(this.skills);
7901
7948
  }
7902
7949
  };
7903
- function tryParseSkill(raw, fallbackName, source) {
7904
- try {
7905
- const frontmatter = parseSkillFrontmatter(raw, fallbackName);
7906
- const metadata = {
7907
- name: frontmatter.name,
7908
- description: frontmatter.description,
7909
- source
7910
- };
7911
- if (frontmatter.category !== void 0) metadata.category = frontmatter.category;
7912
- if (frontmatter.dependencies !== void 0) metadata.dependencies = frontmatter.dependencies;
7913
- return metadata;
7914
- } catch (cause) {
7915
- if (cause instanceof ConfigurationError) {
7916
- const code = cause.code ?? "unknown";
7917
- process.stderr.write(
7918
- `[theokit-sdk] skill ${fallbackName} skipped (${code}): ${cause.message}
7919
- `
7920
- );
7921
- return void 0;
7922
- }
7923
- throw cause;
7924
- }
7925
- }
7926
7950
 
7927
7951
  // src/internal/runtime/local-agent/local-agent-bootstrap.ts
7928
7952
  function registerLocalAgent(args) {
@@ -8335,6 +8359,7 @@ async function initLoopContext(inputs) {
8335
8359
  finalStatus: "finished",
8336
8360
  usage: new UsageAccumulator(),
8337
8361
  nudgeAttempts: 0,
8362
+ stopFeedbackAttempts: 0,
8338
8363
  ...memoryProviderHandle !== void 0 ? { memoryProviderHandle } : {},
8339
8364
  ...memorySystemPromptAdditions !== void 0 ? { memorySystemPromptAdditions } : {}
8340
8365
  };
@@ -8479,8 +8504,9 @@ function registerLoopError(ctx, cause) {
8479
8504
  if (ctx.error !== void 0) return;
8480
8505
  const rawMessage = cause?.message;
8481
8506
  const message = typeof rawMessage === "string" ? rawMessage : cause instanceof Error ? cause.message : String(cause);
8507
+ const metaCode = cause?.metadata?.code;
8482
8508
  const rawCode = cause?.code;
8483
- const code = typeof rawCode === "string" ? rawCode : void 0;
8509
+ const code = typeof metaCode === "string" ? metaCode : typeof rawCode === "string" ? rawCode : void 0;
8484
8510
  ctx.error = code !== void 0 ? { message, code, cause } : { message, cause };
8485
8511
  }
8486
8512
  async function runCollectorLoop(generator, inputs, ctx) {
@@ -9276,6 +9302,7 @@ function computeUsageCost(inputs, usage) {
9276
9302
 
9277
9303
  // src/internal/agent-loop/loop.ts
9278
9304
  var MAX_NUDGE_ATTEMPTS = 2;
9305
+ var MAX_STOP_FEEDBACK_ATTEMPTS = 2;
9279
9306
  async function runAgentLoop(inputs) {
9280
9307
  const sendSpan = inputs.telemetry?.startSpan("agent.send", {
9281
9308
  agentId: inputs.agentId,
@@ -9433,6 +9460,28 @@ function shouldNudgeAndContinue(ctx, llmOutput) {
9433
9460
  });
9434
9461
  return true;
9435
9462
  }
9463
/**
 * Run the `stop` hooks and, if one of them returns non-empty feedback, queue
 * that feedback as a user message so the loop takes another turn.
 *
 * The hooks always run. No message is queued when the hook result is blocked,
 * when `ctx.stopFeedbackAttempts` has reached `MAX_STOP_FEEDBACK_ATTEMPTS`,
 * or when no decision carries non-empty feedback.
 *
 * @param {{hooks: {run: Function}, agentId: string, runId: string}} inputs
 * @param {{stopFeedbackAttempts: number, messages: Array<object>}} ctx -
 *   Mutated: the counter is incremented and a message appended on success.
 * @returns {Promise<boolean>} `true` when feedback was queued.
 */
async function reflectAfterStop(inputs, ctx) {
  const outcome = await inputs.hooks.run({
    event: "stop",
    agentId: inputs.agentId,
    runId: inputs.runId
  });
  if (outcome.blocked || ctx.stopFeedbackAttempts >= MAX_STOP_FEEDBACK_ATTEMPTS) {
    return false;
  }

  // First decision that is explicit feedback with actual content wins.
  const hasFeedback = (d) => d.decision === "feedback" && (d.feedback ?? "").length > 0;
  const feedback = outcome.decisions.find(hasFeedback)?.feedback;
  if (feedback === void 0) {
    return false;
  }

  ctx.stopFeedbackAttempts += 1;
  const feedbackTurn = { role: "user", content: [{ type: "text", text: feedback }] };
  ctx.messages.push(feedbackTurn);
  return true;
}
9479
/**
 * Decide what happens when the model stops without requesting tools.
 *
 * The nudge check runs first; the stop-hook reflection is only consulted when
 * no nudge was issued. If neither asks for another turn, the run is marked
 * finished.
 *
 * @param {object} inputs - Loop inputs, forwarded to `reflectAfterStop`.
 * @param {object} ctx - Loop context; `finalStatus` is set to `"finished"`
 *   when the result is `"done"`.
 * @param {object} llmOutput - Output of the turn, forwarded to the nudge check.
 * @returns {Promise<"continue" | "done">}
 */
async function finishOrReflect(inputs, ctx, llmOutput) {
  const anotherTurn =
    shouldNudgeAndContinue(ctx, llmOutput) || (await reflectAfterStop(inputs, ctx));
  if (anotherTurn) {
    return "continue";
  }
  ctx.finalStatus = "finished";
  return "done";
}
9436
9485
  async function runIteration(inputs, ctx) {
9437
9486
  const llmOutput = await streamLlmTurn(inputs, ctx);
9438
9487
  accumulateUsage(ctx.usage, llmOutput);
@@ -9466,9 +9515,7 @@ async function continueOrTerminate(inputs, ctx, llmOutput) {
9466
9515
  await emitAssistantTextStep(inputs, ctx, llmOutput.text);
9467
9516
  }
9468
9517
  if (llmOutput.stopReason !== "tool_use" || llmOutput.toolCalls.length === 0) {
9469
- if (shouldNudgeAndContinue(ctx, llmOutput)) return "continue";
9470
- ctx.finalStatus = "finished";
9471
- return "done";
9518
+ return finishOrReflect(inputs, ctx, llmOutput);
9472
9519
  }
9473
9520
  ctx.messages.push(buildAssistantTurn(llmOutput.text, llmOutput.toolCalls));
9474
9521
  const toolResults = await dispatchTools(inputs, ctx.tools, llmOutput.toolCalls, ctx.events);
@@ -15491,6 +15538,69 @@ setAgentFacade({
15491
15538
  resume: (agentId, options) => Agent.resume(agentId, options),
15492
15539
  batch: (prompts, options) => Agent.batch(prompts, options)
15493
15540
  });
15541
/**
 * Error describing a JSONL line that could not be accepted.
 * `line` holds the line number passed to the constructor.
 */
var JsonlParseError = class extends Error {
  line;

  /**
   * @param {string} message - Human-readable description.
   * @param {number} line - Line number stored on the error as `line`.
   */
  constructor(message, line) {
    super(message);
    this.name = "JsonlParseError";
    this.line = line;
  }
};
15549
/**
 * Tell whether `value` is a non-null, non-array object.
 *
 * @param {unknown} value
 * @returns {boolean} `false` for `null`, arrays, primitives and functions.
 */
function isPlainObject(value) {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
15552
/**
 * Leniently parse one JSONL line.
 *
 * @param {string} line - Line text; callers in this file trim it first.
 * @returns {object | undefined} The parsed value when it is an object
 *   according to `isPlainObject`; `undefined` for an empty line, invalid
 *   JSON, or any other JSON value.
 */
function tryParseObjectLine(line) {
  if (line.length === 0) {
    return void 0;
  }
  let candidate;
  try {
    candidate = JSON.parse(line);
  } catch {
    // Malformed lines are skipped rather than reported.
    return void 0;
  }
  if (!isPlainObject(candidate)) {
    return void 0;
  }
  return candidate;
}
15562
/**
 * Strictly load a JSONL file, one JSON object per non-blank line.
 *
 * The file is read synchronously as UTF-8 and split on `\n`. Each line is
 * trimmed and blank lines are skipped, but they still count towards the
 * 1-based line number.
 *
 * @param {string} path - File to read.
 * @param {{map?: (record: object, lineNumber: number) => unknown}} [opts] -
 *   When `map` is given, its return value is stored in place of the record.
 * @returns {Array<unknown>} Records (or mapped values) in file order.
 * @throws {JsonlParseError} On the first line that is not valid JSON or does
 *   not parse to an object (per `isPlainObject`).
 */
function loadJsonl(path, opts = {}) {
  const records = [];
  const rawLines = fs.readFileSync(path, "utf8").split("\n");
  for (const [offset, rawLine] of rawLines.entries()) {
    const lineNumber = offset + 1;
    const line = rawLine.trim();
    if (line.length === 0) continue;

    let parsed;
    try {
      parsed = JSON.parse(line);
    } catch {
      throw new JsonlParseError(`line ${lineNumber}: invalid JSON`, lineNumber);
    }
    if (!isPlainObject(parsed)) {
      throw new JsonlParseError(`line ${lineNumber}: not a JSON object`, lineNumber);
    }
    records.push(opts.map ? opts.map(parsed, lineNumber) : parsed);
  }
  return records;
}
15583
/**
 * Append `record` to a JSONL file as a single JSON line, creating the parent
 * directory (recursively) first.
 *
 * @param {string} path$1 - Destination file.
 * @param {unknown} record - Value serialised with `JSON.stringify`.
 * @returns {void}
 */
function appendJsonl(path$1, record) {
  const parentDir = path.dirname(path$1);
  fs.mkdirSync(parentDir, { recursive: true });
  const serialized = JSON.stringify(record);
  fs.appendFileSync(path$1, `${serialized}\n`);
}
15588
/**
 * Collect the string keys of the records in a JSONL file.
 *
 * Lenient by design: an unreadable file yields an empty set, and lines that
 * `tryParseObjectLine` rejects are skipped.
 *
 * @param {string} path - File to read synchronously as UTF-8.
 * @param {(record: object) => unknown} keyFn - Extracts a record's key; only
 *   non-empty string results are kept.
 * @returns {Set<string>} The collected keys.
 */
function readJsonlIds(path, keyFn) {
  const ids = /* @__PURE__ */ new Set();
  let contents;
  try {
    contents = fs.readFileSync(path, "utf8");
  } catch {
    // A missing or unreadable file means nothing has been recorded yet.
    return ids;
  }
  for (const rawLine of contents.split("\n")) {
    const record = tryParseObjectLine(rawLine.trim());
    if (record !== void 0) {
      const key = keyFn(record);
      if (typeof key === "string" && key.length > 0) {
        ids.add(key);
      }
    }
  }
  return ids;
}
15494
15604
 
15495
15605
  // src/internal/eval/runner.ts
15496
15606
  init_agent_factory_registry();
@@ -15682,6 +15792,50 @@ function normalizeScorers(input) {
15682
15792
  return { name: s.name, score: s.score };
15683
15793
  });
15684
15794
  }
15795
/**
 * Build a placeholder result row for a dataset entry that has not been run:
 * empty output, no scores, zero mean score and duration.
 *
 * `makeRowSink` feeds this row to `persist.key` to compute an entry's key.
 *
 * @param {{input: unknown, expected?: unknown, metadata?: unknown}} entry
 * @param {number} index - Position of the entry in the dataset.
 * @returns {object} Row whose `expected` / `metadata` keys are present only
 *   when the entry defines them.
 */
function probeRow(entry, index) {
  const row = { index, input: entry.input, output: "" };
  if (entry.expected !== void 0) {
    row.expected = entry.expected;
  }
  row.scores = [];
  row.meanScore = 0;
  row.durationMs = 0;
  if (entry.metadata !== void 0) {
    row.metadata = entry.metadata;
  }
  return row;
}
15807
/**
 * Work out which row keys are already recorded in the persist file.
 *
 * @param {{resume?: boolean, path: string, key: (row: object) => unknown}} persist
 * @returns {Set<string>} Empty unless `persist.resume` is exactly `true`;
 *   otherwise the keys of the recorded rows that have no `error` field.
 */
function computeDoneKeys(persist) {
  if (persist.resume !== true) {
    return /* @__PURE__ */ new Set();
  }
  // Rows that carry an `error` field yield no key, so they are not counted.
  const keyOfSuccessfulRow = (parsed) => {
    if (parsed.error !== void 0) return void 0;
    return persist.key(parsed);
  };
  return readJsonlIds(persist.path, keyOfSuccessfulRow);
}
15814
/**
 * Persist one row with `appendJsonl`, treating persistence as best-effort:
 * a failure is logged through `console.warn` and otherwise ignored.
 *
 * @param {string} path - JSONL file to append to.
 * @param {object} row - Row to persist.
 * @returns {void}
 */
function appendRowSafely(path, row) {
  try {
    appendJsonl(path, row);
  } catch (err) {
    const detail = err instanceof Error ? err.message : err;
    console.warn("[eval] persist append failed (ignored):", detail);
  }
}
15824
/**
 * Create the sink that decides which entries to skip on resume and that
 * finalises each freshly produced row.
 *
 * @param {{path: string, resume?: boolean, key: (row: object) => unknown} | undefined} persist -
 *   Persistence settings. When omitted, nothing is skipped and nothing is written.
 * @param {((row: object) => unknown) | undefined} classify - Optional
 *   classifier whose non-undefined result is stored on the row as `outcome`.
 * @returns {{isResumed: (entry: object, index: number) => boolean, finalize: (row: object) => object}}
 */
function makeRowSink(persist, classify) {
  const persisting = persist !== void 0;
  const doneKeys = persisting ? computeDoneKeys(persist) : /* @__PURE__ */ new Set();

  // True when the entry's key is already among the recorded keys.
  const isResumed = (entry, index) => {
    if (!persisting || doneKeys.size === 0) {
      return false;
    }
    const key = persist.key(probeRow(entry, index));
    return doneKeys.has(key);
  };

  // Attach the classification (if any), persist the row, and hand it back.
  const finalize = (row) => {
    const outcome = classify?.(row);
    const finalRow = outcome === void 0 ? row : { ...row, outcome };
    if (persisting) {
      appendRowSafely(persist.path, finalRow);
    }
    return finalRow;
  };

  return { isResumed, finalize };
}
15685
15839
  async function applyScorer(scorer, output, expected) {
15686
15840
  let raw;
15687
15841
  try {
@@ -15727,7 +15881,15 @@ function makeAgentForBatch(spec, _entries) {
15727
15881
  }
15728
15882
  return spec;
15729
15883
  }
15730
- async function runRowsManually(entries, spec, scorers, concurrency, signal, onRow) {
15884
/**
 * Process one dataset slot for the manual (per-entry) eval path.
 *
 * Does nothing when the slot has no entry or the sink reports the entry as
 * already completed. Otherwise the entry is run, finalised through the sink,
 * stored at `rows[idx]`, and reported via `onRow`.
 *
 * @param {number} idx - Slot index.
 * @param {Array<object>} entries - Dataset entries.
 * @param {unknown} spec - Agent spec forwarded to `runOneEntry`.
 * @param {Array<object>} scorers - Scorers forwarded to `runOneEntry`.
 * @param {{isResumed: Function, finalize: Function}} sink
 * @param {Array<object>} rows - Output array, written in place.
 * @param {(row: object, index: number) => void} onRow
 * @returns {Promise<void>}
 */
async function runManualSlot(idx, entries, spec, scorers, sink, rows, onRow) {
  const entry = entries[idx];
  const skip = entry === void 0 || sink.isResumed(entry, idx);
  if (skip) {
    return;
  }
  const produced = await runOneEntry(spec, entry, idx, scorers);
  const row = sink.finalize(produced);
  rows[idx] = row;
  onRow(row, idx);
}
15892
+ async function runRowsManually(entries, spec, scorers, concurrency, signal, onRow, sink) {
15731
15893
  const rows = new Array(entries.length);
15732
15894
  const state2 = { cursor: 0 };
15733
15895
  const worker = async () => {
@@ -15735,11 +15897,7 @@ async function runRowsManually(entries, spec, scorers, concurrency, signal, onRo
15735
15897
  if (signal?.aborted === true) return;
15736
15898
  const idx = state2.cursor;
15737
15899
  state2.cursor += 1;
15738
- const entry = entries[idx];
15739
- if (entry === void 0) continue;
15740
- const row = await runOneEntry(spec, entry, idx, scorers);
15741
- rows[idx] = row;
15742
- onRow(row, idx);
15900
+ await runManualSlot(idx, entries, spec, scorers, sink, rows, onRow);
15743
15901
  }
15744
15902
  };
15745
15903
  const workers = Array.from({ length: Math.min(concurrency, entries.length) }, () => worker());
@@ -15789,23 +15947,32 @@ async function scoreBatchOutput(br, expected, scorers) {
15789
15947
  }
15790
15948
  return scoreEntries;
15791
15949
  }
15792
/**
 * Run the eval through the agent facade's `batch` call.
 *
 * Entries the sink reports as already completed are left out of the batch.
 * Every remaining entry keeps its original dataset index, which is used both
 * for the row and for the `onRow` callback.
 *
 * @param {Array<object>} entries - Dataset entries.
 * @param {object} agentOptions - Options spread into the batch options.
 * @param {Array<object>} scorers - Scorers applied to each batch result.
 * @param {number} concurrency - Forwarded to `batch`.
 * @param {AbortSignal | undefined} signal - Forwarded to `batch` when defined.
 * @param {(row: object, index: number) => void} onRow
 * @param {{isResumed: Function, finalize: Function}} sink
 * @returns {Promise<Array<object>>} Finalised rows for the entries that ran.
 */
async function runRowsViaBatch(entries, agentOptions, scorers, concurrency, signal, onRow, sink) {
  const pending = [];
  entries.forEach((entry, index) => {
    if (entry === void 0) return;
    if (sink.isResumed(entry, index)) return;
    pending.push({ entry, index });
  });

  const batchOpts = { ...agentOptions, concurrency };
  if (signal !== void 0) {
    batchOpts.signal = signal;
  }

  const facade = getAgentFacade();
  const batchResults = await facade.batch(
    pending.map(({ entry }) => entry.input),
    batchOpts
  );

  const rows = [];
  for (let i = 0; i < batchResults.length; i += 1) {
    const slot = pending[i];
    const br = batchResults[i];
    if (slot === void 0 || br === void 0) continue;
    // Results are positional: batchResults[i] belongs to pending[i].
    const scoreEntries = await scoreBatchOutput(br, slot.entry.expected, scorers);
    const row = sink.finalize(rowFromBatchResult(slot.entry, br, scoreEntries, slot.index));
    rows.push(row);
    onRow(row, slot.index);
  }
  return rows;
}
@@ -15830,6 +15997,7 @@ async function runEval(options, runOpts) {
15830
15997
  const onRow = (row, i) => {
15831
15998
  safeHook(() => hooks?.afterRow?.(row, i));
15832
15999
  };
16000
+ const sink = makeRowSink(runOpts?.persist, runOpts?.classify);
15833
16001
  let rows;
15834
16002
  if (isAgentInstance(options.agent) || typeof options.agent === "function") {
15835
16003
  rows = await runRowsManually(
@@ -15838,11 +16006,12 @@ async function runEval(options, runOpts) {
15838
16006
  scorers,
15839
16007
  concurrency,
15840
16008
  signal,
15841
- onRow
16009
+ onRow,
16010
+ sink
15842
16011
  );
15843
16012
  } else {
15844
16013
  const batchOpts = makeAgentForBatch(options.agent, indexed);
15845
- rows = await runRowsViaBatch(indexed, batchOpts, scorers, concurrency, signal, onRow);
16014
+ rows = await runRowsViaBatch(indexed, batchOpts, scorers, concurrency, signal, onRow, sink);
15846
16015
  }
15847
16016
  const aggregate = computeAggregate(rows);
15848
16017
  const endedAt = Date.now();
@@ -15868,6 +16037,25 @@ async function runEval(options, runOpts) {
15868
16037
  }
15869
16038
  }
15870
16039
 
16040
+ // src/sandbox/shell-escape.ts
16041
/**
 * Quote `arg` as a single POSIX shell word.
 *
 * The value is wrapped in single quotes; each embedded single quote is
 * emitted as `'\''` (close the quote, escaped quote, reopen the quote).
 *
 * @param {string} arg - Raw argument.
 * @returns {string} The single-quoted form.
 */
function shellEscapePosix(arg) {
  const escaped = arg.replaceAll("'", "'\\''");
  return `'${escaped}'`;
}
16044
+
16045
+ // src/internal/eval/code-runner.ts
16046
// File name, relative to the repo root, under which the captured diff is uploaded.
var ARTIFACT_PATCH = ".theo-artifact.patch";

/**
 * Capture the repo's uncommitted changes as a patch and check that the patch
 * round-trips.
 *
 * Runs `git diff` in `repoDir` through the sandbox. A non-empty diff is
 * uploaded to `<repoDir>/.theo-artifact.patch` and checked with
 * `git apply --check --reverse`.
 *
 * @param {{execute: Function, uploadFile: Function}} sandbox - Sandbox backend.
 * @param {string} repoDir - Repository directory inside the sandbox; it is
 *   shell-escaped before being placed in a command.
 * @returns {Promise<{diff: string, applies: boolean}>} `applies` is `true`
 *   when the check command exits with `0`; an empty diff yields
 *   `{ diff: "", applies: false }` without uploading anything.
 */
async function captureArtifact(sandbox, repoDir) {
  const quotedRepo = shellEscapePosix(repoDir);
  const { stdout: diff } = await sandbox.execute(`git -C ${quotedRepo} diff`);
  if (diff.length === 0) {
    return { diff: "", applies: false };
  }
  await sandbox.uploadFile(`${repoDir}/${ARTIFACT_PATCH}`, diff);
  const quotedPatch = shellEscapePosix(ARTIFACT_PATCH);
  const { exitCode } = await sandbox.execute(
    `git -C ${quotedRepo} apply --check --reverse ${quotedPatch}`
  );
  return { diff, applies: exitCode === 0 };
}
16058
+
15871
16059
  // src/internal/scorers/llm-judge.ts
15872
16060
  init_agent_factory_registry();
15873
16061
  function buildPrompt(subject, criteria, rubric, expected) {
@@ -16010,6 +16198,38 @@ var Scorers = {
16010
16198
  }
16011
16199
  };
16012
16200
  },
16201
+ /**
16202
+ * Verify-gate scorer (M6-2): runs the project's tests in the provisioned
16203
+ * repo via `SandboxBackend.execute` and scores `1` iff the command exits `0`,
16204
+ * else `0` with the exit code + truncated stderr in `reason`. Grades the
16205
+ * artifact captured by `captureArtifact` (D2 — rides `execute`, never a
16206
+ * direct `child_process`).
16207
+ *
16208
+ * SECURITY: `command` is REQUIRED and the caller's builder owns shell-safety
16209
+ * of the (potentially untrusted, dataset-derived) test identifiers. There is
16210
+ * NO default that runs bare test names — that would interpolate untrusted
16211
+ * `failToPass`/`passToPass` straight into a shell. `repoDir` is shell-escaped
16212
+ * by the SDK; the test list is the builder's responsibility to render safely.
16213
+ *
16214
+ * PORTABILITY: the command is wrapped as `cd <repoDir> && <cmd>`, which
16215
+ * assumes a shell-backed `SandboxBackend` (LocalSandbox/Docker). A backend
16216
+ * that rejects shell metacharacters in `execute` is unsupported for this scorer.
16217
+ */
16218
+ verifyGate(opts) {
16219
+ const { sandbox, repoDir, failToPass, passToPass, command } = opts;
16220
+ return {
16221
+ name: "verify-gate",
16222
+ score: async () => {
16223
+ const cmd = command([...failToPass, ...passToPass]).trim();
16224
+ if (cmd.length === 0) {
16225
+ return { score: 0, reason: "verify_gate_empty_command" };
16226
+ }
16227
+ const r = await sandbox.execute(`cd ${shellEscapePosix(repoDir)} && ${cmd}`);
16228
+ if (r.exitCode === 0) return { score: 1 };
16229
+ return { score: 0, reason: `exit=${r.exitCode} ${r.stderr.slice(0, 200)}`.trim() };
16230
+ }
16231
+ };
16232
+ },
16013
16233
  jsonShape(schema, opts = {}) {
16014
16234
  return {
16015
16235
  name: "json-shape",
@@ -16084,6 +16304,9 @@ var Eval = class _Eval {
16084
16304
 
16085
16305
  exports.Eval = Eval;
16086
16306
  exports.EvalAlreadyRunningError = EvalAlreadyRunningError;
16307
+ exports.JsonlParseError = JsonlParseError;
16087
16308
  exports.Scorers = Scorers;
16309
+ exports.captureArtifact = captureArtifact;
16310
+ exports.loadJsonl = loadJsonl;
16088
16311
  //# sourceMappingURL=eval.cjs.map
16089
16312
  //# sourceMappingURL=eval.cjs.map