@alis-build/harness-eval 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +187 -30
- package/dist/adapters/claude-code/index.d.ts +2 -2
- package/dist/adapters/claude-code/index.js +2 -1
- package/dist/adapters/codex/index.d.ts +68 -0
- package/dist/adapters/codex/index.js +3 -0
- package/dist/{claude-code-DZ4Vkgp6.js → claude-code-C_7hxC8z.js} +3 -245
- package/dist/claude-code-C_7hxC8z.js.map +1 -0
- package/dist/cli/bin.js +131 -151
- package/dist/cli/bin.js.map +1 -1
- package/dist/codex-0cHO2te9.js +496 -0
- package/dist/codex-0cHO2te9.js.map +1 -0
- package/dist/config/loader.d.ts +2 -2
- package/dist/config/loader.js +2 -2
- package/dist/{index-V22PrR0p.d.ts → index-C56AEDUr.d.ts} +2 -2
- package/dist/index.d.ts +134 -6
- package/dist/index.js +6 -5
- package/dist/index.js.map +1 -1
- package/dist/{loader-DcI0KfRX.js → loader-CiBm4Kf6.js} +491 -209
- package/dist/loader-CiBm4Kf6.js.map +1 -0
- package/dist/loader-CrmzNwkq.d.ts +107 -0
- package/dist/{projections-BcX7w-f6.js → reporter-BKCJZRYr.js} +1475 -729
- package/dist/reporter-BKCJZRYr.js.map +1 -0
- package/dist/runner/suite.d.ts +1 -1
- package/dist/runner/suite.js +1 -1
- package/dist/{suite-Dlzl-HI0.js → suite-C3-8EjUW.js} +558 -4
- package/dist/suite-C3-8EjUW.js.map +1 -0
- package/dist/{suite-DPJMIEbu.d.ts → suite-qyOGre2g.d.ts} +2 -2
- package/dist/types-Bac8_Ixb.js +246 -0
- package/dist/types-Bac8_Ixb.js.map +1 -0
- package/dist/{types-CD3TwOtZ.d.ts → types-CLt4Yygc.d.ts} +2 -2
- package/dist/{types-B9H4IZtA.d.ts → types-D0HR2WnP.d.ts} +9 -2
- package/dist/types-DFMpv_HJ.d.ts +77 -0
- package/package.json +11 -2
- package/schemas/eval-run-envelope.schema.json +193 -183
- package/dist/claude-code-DZ4Vkgp6.js.map +0 -1
- package/dist/loader-C9yQHUPC.d.ts +0 -50
- package/dist/loader-DcI0KfRX.js.map +0 -1
- package/dist/projections-BcX7w-f6.js.map +0 -1
- package/dist/suite-Dlzl-HI0.js.map +0 -1
|
@@ -1,5 +1,11 @@
|
|
|
1
|
-
import { t as
|
|
1
|
+
import { n as TrajectoryBuilder, t as AdapterError } from "./types-Bac8_Ixb.js";
|
|
2
|
+
import { t as claudeCodeAdapter } from "./claude-code-C_7hxC8z.js";
|
|
2
3
|
import { n as parseCardinality, t as describeCardinality } from "./cardinality-DlE44e-4.js";
|
|
4
|
+
import { t as codexAdapter } from "./codex-0cHO2te9.js";
|
|
5
|
+
import { spawn } from "node:child_process";
|
|
6
|
+
import { mkdtemp, rm } from "node:fs/promises";
|
|
7
|
+
import { tmpdir } from "node:os";
|
|
8
|
+
import { join } from "node:path";
|
|
3
9
|
//#region src/assertions/patterns.ts
|
|
4
10
|
/**
|
|
5
11
|
* Test whether a fully-qualified tool name matches a pattern.
|
|
@@ -454,12 +460,536 @@ function evaluateAll(view, assertions) {
|
|
|
454
460
|
return assertions.map((a) => evaluate(view, a));
|
|
455
461
|
}
|
|
456
462
|
//#endregion
|
|
463
|
+
//#region src/adapters/gemini-cli/exit-codes.ts
|
|
464
|
+
/**
 * Known Gemini CLI exit codes for headless / stream-json runs.
 *
 * @see spec P-7 — preserve codes in diagnostics and surface human-readable labels.
 */
/** Documented Gemini CLI exit codes for headless harness runs (spec P-7). Frozen: it is a lookup table, not state. */
const GEMINI_CLI_EXIT_CODES = Object.freeze({
	/** Normal completion. */
	SUCCESS: 0,
	/** Unhandled CLI or runtime failure. */
	ERROR: 1,
	/** Invalid prompt, flags, or stdin (exit 42). */
	INPUT_ERROR: 42,
	/** Agent exceeded configured turn budget (exit 53). */
	TURN_LIMIT: 53
});
/**
 * Return a short description for a non-zero Gemini CLI exit code.
 *
 * @param {number | null | undefined} exitCode - Exit code reported for the
 *   process, or a nullish value when no numeric code is available.
 * @returns {string | undefined} A human-readable label, or `undefined` when
 *   the code is missing or is the success code.
 */
function describeGeminiCliExitCode(exitCode) {
	// `== null` matches both null and undefined, so a missing code never
	// produces the label "Gemini CLI exited with code undefined".
	if (exitCode == null || exitCode === GEMINI_CLI_EXIT_CODES.SUCCESS) return;
	switch (exitCode) {
		case GEMINI_CLI_EXIT_CODES.ERROR: return "Gemini CLI exited with a general error (code 1)";
		case GEMINI_CLI_EXIT_CODES.INPUT_ERROR: return "Gemini CLI input error (code 42)";
		case GEMINI_CLI_EXIT_CODES.TURN_LIMIT: return "Gemini CLI turn limit exceeded (code 53)";
		default: return `Gemini CLI exited with code ${exitCode}`;
	}
}
|
|
495
|
+
//#endregion
|
|
496
|
+
//#region src/adapters/gemini-cli/map-events.ts
|
|
497
|
+
/**
 * Stateful mapper from parsed Gemini CLI JSON events to harness stream events.
 *
 * Tracks the session id and model announced by `init`, buffers assistant delta
 * text until it can be emitted as a single message, and remembers which tool
 * ids already have an emitted `tool_use` so a `tool_result` without one can be
 * paired with a placeholder call.
 */
class GeminiCliEventMapper {
	sessionId = "gemini-session";
	model = "";
	sawInit = false;
	startedTools = /* @__PURE__ */ new Set();
	assistantDeltaBuffer = "";
	turnCount = 0;
	/**
	 * Map one parsed Gemini JSON value to zero or more stream events.
	 *
	 * @param {unknown} event - Value parsed from one output line.
	 * @returns {object[]} Stream events, in emission order (possibly empty).
	 */
	map(event) {
		// `null`, numbers and strings are valid JSON lines but not event objects;
		// ignore them instead of throwing on `event.type`.
		if (event === null || typeof event !== "object") return [];
		const type = event.type;
		if (!type) return [];
		switch (type) {
			case "init": return [this.buildInit(event.session_id ?? this.sessionId, event.model ?? "")];
			case "message": return this.mapMessage(event);
			case "tool_use": return this.mapToolUse(event);
			case "tool_result": return this.mapToolResult(event);
			case "error": return [];
			case "result": return this.mapResult(event);
			default: return [];
		}
	}
	/** Record the session id and model, mark init as seen, and build the `system`/`init` event. */
	buildInit(sessionId, model) {
		this.sessionId = sessionId;
		this.model = model;
		this.sawInit = true;
		return {
			type: "system",
			subtype: "init",
			session_id: sessionId,
			cwd: "",
			model,
			tools: [],
			mcp_servers: []
		};
	}
	/** Return a synthesized init event if none has been emitted yet, otherwise nothing. */
	ensureInit() {
		if (this.sawInit) return [];
		return [this.buildInit(this.sessionId, this.model)];
	}
	/** Build an `assistant` event wrapping a single content block. */
	buildAssistantEvent(id, contentBlock, stopReason) {
		return {
			type: "assistant",
			session_id: this.sessionId,
			message: {
				id,
				type: "message",
				role: "assistant",
				content: [contentBlock],
				stop_reason: stopReason
			}
		};
	}
	/**
	 * Map a `message` event. User-role messages are dropped; delta chunks are
	 * buffered; a non-delta message emits the buffered text plus its own content.
	 */
	mapMessage(event) {
		if (event.role === "user") return [];
		const chunk = event.content ?? "";
		if (event.delta) {
			this.assistantDeltaBuffer += chunk;
			return this.ensureInit();
		}
		const text = this.assistantDeltaBuffer + chunk;
		this.assistantDeltaBuffer = "";
		if (!text) return this.ensureInit();
		return [...this.ensureInit(), this.buildAssistantEvent(`msg_${this.turnCount}`, {
			type: "text",
			text
		}, "end_turn")];
	}
	/** Map a `tool_use` event, flushing any buffered assistant text first. */
	mapToolUse(event) {
		// Events without a tool_id get a random placeholder id that is not tracked.
		const toolId = event.tool_id ?? `tool_${Math.random().toString(36).slice(2)}`;
		if (event.tool_id) this.startedTools.add(event.tool_id);
		const name = resolveGeminiToolName(event.tool_name ?? "unknown", event.parameters ?? {});
		return [
			...this.flushDeltaBuffer(),
			...this.ensureInit(),
			this.buildAssistantEvent(`assistant_${toolId}`, {
				type: "tool_use",
				id: toolId,
				name,
				input: event.parameters ?? {}
			}, "tool_use")
		];
	}
	/**
	 * Map a `tool_result` event to a user-role `tool_result` message. When no
	 * `tool_use` was seen for the id, a placeholder `tool_use` named "unknown"
	 * is emitted first.
	 */
	mapToolResult(event) {
		const toolId = event.tool_id ?? "";
		// Flush buffered assistant text first (as mapToolUse does) so it is not
		// emitted after the tool result it preceded.
		const events = [...this.flushDeltaBuffer(), ...this.ensureInit()];
		if (toolId && !this.startedTools.has(toolId)) {
			events.push(this.buildAssistantEvent(`assistant_${toolId}`, {
				type: "tool_use",
				id: toolId,
				name: "unknown",
				input: {}
			}, "tool_use"));
		} else if (toolId) this.startedTools.delete(toolId);
		const isError = event.status === "error" || event.error != null;
		const content = event.output ?? event.error?.message ?? "";
		events.push({
			type: "user",
			session_id: this.sessionId,
			message: {
				role: "user",
				content: [{
					type: "tool_result",
					tool_use_id: toolId,
					content,
					is_error: isError
				}]
			}
		});
		return events;
	}
	/** Map the final `result` event, flushing buffered text and attaching usage stats. */
	mapResult(event) {
		this.turnCount++;
		const isError = event.status === "error";
		return [...this.flushDeltaBuffer(), {
			type: "result",
			subtype: isError ? "error" : "success",
			session_id: this.sessionId,
			is_error: isError,
			result: event.error?.message ?? "",
			usage: mapUsage(event.stats),
			total_cost_usd: event.stats?.total_cost_usd ?? 0,
			duration_ms: event.stats?.duration_ms ?? 0,
			num_turns: this.turnCount
		}];
	}
	/** Emit buffered delta text as one assistant message before tool/result events. */
	flushDeltaBuffer() {
		if (!this.assistantDeltaBuffer) return [];
		const text = this.assistantDeltaBuffer;
		this.assistantDeltaBuffer = "";
		return [...this.ensureInit(), this.buildAssistantEvent(`msg_delta_${this.turnCount}`, {
			type: "text",
			text
		}, "end_turn")];
	}
}
|
|
660
|
+
/**
 * Translate a Gemini tool name (plus its call parameters) into the harness
 * tool name.
 *
 * Resolution order:
 *  1. names already shaped `mcp__…` are returned untouched;
 *  2. non-empty string `server` and `tool` parameters yield `mcp__<server>__<tool>`;
 *  3. `mcp_<server>_<tool>` names are split at the LAST underscore;
 *  4. anything else keeps its native name.
 *
 * @param {string} toolName - Tool name as reported by Gemini.
 * @param {object} parameters - Tool call parameters.
 * @returns {string} Harness tool name.
 */
function resolveGeminiToolName(toolName, parameters) {
	const qualify = (serverPart, toolPart) => `mcp__${serverPart}__${toolPart}`;
	if (toolName.startsWith("mcp__")) return toolName;
	const { server, tool } = parameters;
	const hasExplicitTarget = typeof server === "string" && typeof tool === "string" && server !== "" && tool !== "";
	if (hasExplicitTarget) return qualify(server, tool);
	// Names starting with "mcp__" already returned above, so a single-underscore
	// prefix check is sufficient here.
	if (!toolName.startsWith("mcp_")) return toolName;
	const remainder = toolName.slice("mcp_".length);
	const cut = remainder.lastIndexOf("_");
	if (cut <= 0) return toolName;
	const toolPart = remainder.slice(cut + 1);
	// cut > 0 guarantees a non-empty server part; only the tool part can be empty.
	return toolPart ? qualify(remainder.slice(0, cut), toolPart) : toolName;
}
|
|
681
|
+
/**
 * Extract token usage from Gemini result stats.
 *
 * @param {object | null | undefined} stats - Stats object from a `result` event.
 * @returns {{ input_tokens: number, output_tokens: number }} Token counts,
 *   with 0 substituted for any missing (nullish) value.
 */
function mapUsage(stats) {
	const countOf = (key) => stats?.[key] ?? 0;
	return {
		input_tokens: countOf("input_tokens"),
		output_tokens: countOf("output_tokens")
	};
}
|
|
687
|
+
//#endregion
|
|
688
|
+
//#region src/adapters/gemini-cli/parse-json.ts
|
|
689
|
+
/**
 * Read newline-delimited JSON from a stream and yield one parse result per
 * non-blank line.
 *
 * Chunks are decoded as UTF-8 and split on "\n"; an incomplete final piece is
 * carried over to the next chunk. Lines are trimmed, blank lines are skipped,
 * and any non-blank text left after the stream ends is parsed as a last line.
 *
 * @param {import("node:stream").Readable} stream - Readable producing the output.
 * @yields {object} Result of `tryParseLine` for each non-blank line.
 */
async function* parseGeminiCliJson(stream) {
	stream.setEncoding("utf8");
	let carry = "";
	for await (const chunk of stream) {
		const pieces = (carry + chunk).split("\n");
		// The last piece has no terminating newline yet; keep it for later.
		carry = pieces.pop();
		for (const piece of pieces) {
			const line = piece.trim();
			if (line.length > 0) yield tryParseLine(line);
		}
	}
	const leftover = carry.trim();
	if (leftover.length > 0) yield tryParseLine(leftover);
}
|
|
706
|
+
/**
 * Parse one line as JSON without throwing.
 *
 * @param {string} line - Text of a single line.
 * @returns {{ ok: true, event: unknown, rawLine: string } |
 *   { ok: false, error: Error, rawLine: string }} The parsed value on success,
 *   otherwise the parse failure (non-Error throwables are wrapped in an Error).
 */
function tryParseLine(line) {
	let event;
	try {
		event = JSON.parse(line);
	} catch (err) {
		const error = err instanceof Error ? err : new Error(String(err));
		return {
			ok: false,
			error,
			rawLine: line
		};
	}
	return {
		ok: true,
		event,
		rawLine: line
	};
}
|
|
721
|
+
//#endregion
|
|
722
|
+
//#region src/adapters/gemini-cli/flags.ts
|
|
723
|
+
/**
 * Append `flag value` once for every entry of `values`.
 *
 * @param {string[]} args - argv array, mutated in place.
 * @param {string} flag - Flag name, e.g. `--extensions`.
 * @param {Iterable<string> | undefined} values - Values to emit; a falsy value
 *   appends nothing.
 */
function pushRepeatableFlag(args, flag, values) {
	if (values) {
		for (const entry of values) {
			args.push(flag, entry);
		}
	}
}
|
|
727
|
+
/**
 * Append an optional flag to `args`.
 *
 * - `undefined` / `null`: nothing is appended (previously `null` produced the
 *   literal argument "null", e.g. `--model null`).
 * - boolean: the bare flag is appended only when `true`.
 * - anything else: `flag` followed by the value converted with `String()`.
 *
 * @param {string[]} args - argv array, mutated in place.
 * @param {string} flag - Flag name, e.g. `--model`.
 * @param {unknown} value - Configured value for the flag.
 */
function pushOptionalFlag(args, flag, value) {
	if (value == null) return;
	if (typeof value === "boolean") {
		if (value) args.push(flag);
		return;
	}
	args.push(flag, String(value));
}
|
|
735
|
+
/**
 * Append the Gemini CLI flags shared by `buildArgs` and `buildJudgeArgs`
 * (everything except the prompt and the output format).
 *
 * Flags are emitted in a fixed order. `--approval-mode` falls back to "yolo"
 * when `config.approvalMode` is nullish.
 *
 * @param {string[]} args - argv array, mutated in place.
 * @param {object} config - Adapter config providing the flag values.
 */
function appendGeminiCliFlags(args, config) {
	// [helper, flag, value] in emission order.
	const plan = [
		[pushOptionalFlag, "--approval-mode", config.approvalMode ?? "yolo"],
		[pushOptionalFlag, "--model", config.model],
		[pushOptionalFlag, "--sandbox", config.sandbox],
		[pushOptionalFlag, "--skip-trust", config.skipTrust],
		[pushRepeatableFlag, "--include-directories", config.includeDirectories],
		[pushRepeatableFlag, "--allowed-mcp-server-names", config.allowedMcpServerNames],
		[pushRepeatableFlag, "--extensions", config.extensions],
		[pushOptionalFlag, "--debug", config.debug]
	];
	for (const [push, flag, value] of plan) push(args, flag, value);
}
|
|
751
|
+
/**
 * Build argv for `gemini -p "<prompt>" --output-format stream-json …`.
 *
 * The prompt is supplied as the value of `-p` at the start of argv; the
 * shared flags from `appendGeminiCliFlags` follow the output format.
 * `skipTrust` defaults to `true` when the config leaves it nullish.
 *
 * @param {object} config - Adapter config; `config.prompt` is the prompt text.
 * @returns {string[]} Arguments to pass to the Gemini CLI binary.
 */
function buildArgs(config) {
	const { prompt, skipTrust } = config;
	const argv = ["-p", prompt, "--output-format", "stream-json"];
	const effectiveConfig = { ...config, skipTrust: skipTrust ?? true };
	appendGeminiCliFlags(argv, effectiveConfig);
	return argv;
}
|
|
771
|
+
/**
 * Build argv for `gemini -p "<prompt>" --output-format json …` (judge).
 *
 * Same layout as `buildArgs`, but with `--output-format json` and the prompt
 * passed separately from the config. `approvalMode` defaults to "yolo" and
 * `skipTrust` to `true` when nullish.
 *
 * @param {string} prompt - Judge prompt text.
 * @param {object} [config={}] - Optional flag configuration.
 * @returns {string[]} Arguments to pass to the Gemini CLI binary.
 */
function buildJudgeArgs(prompt, config = {}) {
	const { approvalMode, skipTrust } = config;
	const argv = ["-p", prompt, "--output-format", "json"];
	const effectiveConfig = {
		...config,
		approvalMode: approvalMode ?? "yolo",
		skipTrust: skipTrust ?? true
	};
	appendGeminiCliFlags(argv, effectiveConfig);
	return argv;
}
|
|
791
|
+
//#endregion
|
|
792
|
+
//#region src/adapters/gemini-cli/process.ts
|
|
793
|
+
/**
|
|
794
|
+
* Process management for the Gemini CLI adapter.
|
|
795
|
+
*
|
|
796
|
+
* Spawns `gemini -p … --output-format stream-json`, handles timeout/abort,
|
|
797
|
+
* and optional config-directory isolation.
|
|
798
|
+
*/
|
|
799
|
+
/** Timeout (ms) applied by `spawnGeminiCli` when `config.timeoutMs` is not set: five minutes. */
const DEFAULT_TIMEOUT_MS = 5 * 60 * 1000;
/** Grace period (ms) between SIGTERM and SIGKILL on timeout or abort. */
const KILL_GRACE_MS = 5000;
/** Name of the environment variable that receives the temporary config directory for isolated runs. */
const GEMINI_CONFIG_DIR_ENV = "GEMINI_CONFIG_DIR";
|
|
804
|
+
/**
 * Pick the config directory for an isolated run.
 *
 * @param {object} config - Adapter config; isolation applies only when
 *   `config.isolateConfig` is exactly `true`.
 * @param {string | null | undefined} tempDir - Temporary directory created for the run.
 * @returns {string | undefined} `tempDir` when isolation is enabled and a
 *   directory was provided, otherwise `undefined`.
 */
function resolveGeminiConfigDir(config, tempDir) {
	const isolated = config.isolateConfig === true;
	return isolated && tempDir ? tempDir : undefined;
}
|
|
809
|
+
/**
 * Build the subprocess environment, optionally isolating the Gemini config
 * directory.
 *
 * When `config.isolateConfig` is exactly `true`, a fresh temp directory
 * (prefix `harness-eval-gemini-`) is created and exposed through the
 * `GEMINI_CONFIG_DIR_ENV` variable. `config.env` entries override `baseEnv`.
 *
 * @param {object} config - Adapter config (`isolateConfig`, `env`).
 * @param {object} [baseEnv=process.env] - Environment to start from.
 * @returns {Promise<{ env: object, cleanup: () => Promise<void> }>} The merged
 *   environment and a cleanup function that removes the temp directory, if any.
 */
async function prepareGeminiCliEnv(config, baseEnv = process.env) {
	let tempConfigDir = null;
	if (config.isolateConfig === true) {
		tempConfigDir = await mkdtemp(join(tmpdir(), "harness-eval-gemini-"));
	}
	const env = Object.assign({}, baseEnv, config.env);
	const isolatedDir = resolveGeminiConfigDir(config, tempConfigDir);
	if (isolatedDir) env[GEMINI_CONFIG_DIR_ENV] = isolatedDir;
	async function cleanup() {
		if (!tempConfigDir) return;
		// Best effort: a failure to delete the temp directory is ignored.
		await rm(tempConfigDir, { recursive: true, force: true }).catch(() => {});
	}
	return { env, cleanup };
}
|
|
837
|
+
/**
 * Spawn `gemini -p … --output-format stream-json` with optional config-dir isolation.
 *
 * Timeout and abort both send SIGTERM via `killTree`, then SIGKILL after
 * `KILL_GRACE_MS` unless the process has finished in the meantime. A signal
 * that is already aborted when this function is called is honored immediately.
 *
 * @param {object} config - Adapter config (`binary`, `cwd`, `env`,
 *   `timeoutMs`, `signal`, `isolateConfig`, plus the flag options).
 * @returns {Promise<{
 *   stdout: import("node:stream").Readable,
 *   done: Promise<{ exitCode: number | null, signal: string | null }>,
 *   stderrCollected: Promise<string>,
 *   timedOut: () => boolean,
 *   cleanup: () => Promise<void>
 * }>} Handle to the running process. `done` and `stderrCollected` never reject.
 * @throws Whatever `spawn()` throws synchronously (e.g. invalid arguments);
 *   the temporary environment is cleaned up before rethrowing.
 */
async function spawnGeminiCli(config) {
	const binary = config.binary ?? "gemini";
	const args = buildArgs(config);
	const { env, cleanup: envCleanup } = await prepareGeminiCliEnv(config);
	let child;
	try {
		child = spawn(binary, args, {
			cwd: config.cwd ?? process.cwd(),
			env,
			stdio: [
				"ignore",
				"pipe",
				"pipe"
			],
			detached: true
		});
	} catch (err) {
		// The caller never receives `cleanup` when spawn() throws, so release
		// the temporary config directory here instead of leaking it.
		await envCleanup();
		throw err;
	}
	let timedOut = false;
	let killEscalation = null;
	const timeoutMs = config.timeoutMs ?? DEFAULT_TIMEOUT_MS;
	const scheduleKillEscalation = () => {
		if (killEscalation) clearTimeout(killEscalation);
		killEscalation = setTimeout(() => killTree(child, "SIGKILL"), KILL_GRACE_MS);
	};
	const timeoutTimer = setTimeout(() => {
		timedOut = true;
		killTree(child, "SIGTERM");
		scheduleKillEscalation();
	}, timeoutMs);
	const onAbort = () => {
		killTree(child, "SIGTERM");
		scheduleKillEscalation();
	};
	config.signal?.addEventListener("abort", onAbort, { once: true });
	const stderrChunks = [];
	child.stderr?.setEncoding("utf8");
	child.stderr?.on("data", (chunk) => {
		stderrChunks.push(chunk);
	});
	const stderrCollected = new Promise((resolve) => {
		const finalize = () => resolve(stderrChunks.join(""));
		if (!child.stderr) {
			// No stream to wait for; settle now so callers cannot hang.
			finalize();
			return;
		}
		child.stderr.on("end", finalize);
		child.stderr.on("error", finalize);
		// "close" also covers a stream destroyed without emitting "end";
		// resolving twice is harmless.
		child.stderr.on("close", finalize);
	});
	const done = new Promise((resolve) => {
		let settled = false;
		const finalize = (exitCode, signal) => {
			if (settled) return;
			settled = true;
			clearTimeout(timeoutTimer);
			if (killEscalation) clearTimeout(killEscalation);
			config.signal?.removeEventListener("abort", onAbort);
			resolve({
				exitCode,
				signal
			});
		};
		child.on("close", (code, signal) => finalize(code, signal));
		child.on("error", () => finalize(null, null));
	});
	// An already-aborted signal never dispatches "abort", so the listener above
	// would not run. Checked after all listeners are attached.
	if (config.signal?.aborted) onAbort();
	return {
		stdout: child.stdout,
		done,
		stderrCollected,
		timedOut: () => timedOut,
		cleanup: envCleanup
	};
}
|
|
908
|
+
/**
 * Send `signal` to the child's process group, falling back to the child alone.
 *
 * Does nothing when the child has no pid. All errors are ignored: this is a
 * best-effort teardown.
 *
 * @param {import("node:child_process").ChildProcess} child - Process to signal.
 * @param {string} signal - Signal name, e.g. "SIGTERM".
 */
function killTree(child, signal) {
	const { pid } = child;
	if (pid === undefined) return;
	try {
		// A negative pid targets the whole process group.
		process.kill(-pid, signal);
		return;
	} catch {
		// Group signal failed; try the child directly below.
	}
	try {
		child.kill(signal);
	} catch {
		// Nothing more to do.
	}
}
|
|
918
|
+
//#endregion
|
|
919
|
+
//#region src/adapters/gemini-cli/index.ts
|
|
920
|
+
/**
|
|
921
|
+
* Gemini CLI adapter — public API.
|
|
922
|
+
*/
|
|
923
|
+
/**
 * Run Gemini CLI in headless stream-json mode and return a trajectory.
 *
 * Each parsed stdout line is mapped through a `GeminiCliEventMapper` and fed
 * to a `TrajectoryBuilder`; lines that fail to parse are collected in
 * `diagnostics.parseErrors` without stopping the run. The spawn handle's
 * `cleanup` is always awaited before returning or throwing.
 *
 * @param {object} config - Gemini CLI adapter config.
 * @returns {Promise<{ view: unknown, diagnostics: object, rawEvents: object[] }>}
 *   The built trajectory view, process diagnostics, and mapped events.
 * @throws {AdapterError} When the builder cannot produce a trajectory; the
 *   error carries the diagnostics.
 */
async function runGeminiCli(config) {
	const startTs = Date.now();
	const spawned = await spawnGeminiCli(config);
	const builder = new TrajectoryBuilder();
	const mapper = new GeminiCliEventMapper();
	const rawEvents = [];
	const parseErrors = [];
	try {
		for await (const parsed of parseGeminiCliJson(spawned.stdout)) {
			if (parsed.ok) {
				for (const mapped of mapper.map(parsed.event)) {
					builder.consume(mapped);
					rawEvents.push(mapped);
				}
			} else {
				parseErrors.push({
					line: parsed.rawLine,
					error: parsed.error.message
				});
			}
		}
		const [exit, stderr] = await Promise.all([spawned.done, spawned.stderrCollected]);
		const exitCodeDescription = describeGeminiCliExitCode(exit.exitCode);
		const diagnostics = {
			exitCode: exit.exitCode,
			exitCodeDescription,
			signal: exit.signal,
			stderr,
			parseErrors,
			timedOut: spawned.timedOut(),
			durationMs: Date.now() - startTs
		};
		try {
			return {
				view: builder.build(),
				diagnostics,
				rawEvents
			};
		} catch (err) {
			const reason = err instanceof Error ? err.message : String(err);
			const codeHint = exitCodeDescription ? ` (${exitCodeDescription})` : "";
			// Include at most the first 400 characters of trimmed stderr.
			const trimmedStderr = stderr.trim();
			const stderrHint = trimmedStderr.length > 0 ? ` stderr: ${trimmedStderr.slice(0, 400)}` : "";
			throw new AdapterError(`harness produced no usable trajectory: ${reason}${codeHint}${stderrHint}`, diagnostics);
		}
	} finally {
		await spawned.cleanup();
	}
}
|
|
979
|
+
/**
 * Harness adapter object for Gemini CLI headless runs: its id is "gemini-cli"
 * and `run` is {@link runGeminiCli}.
 */
const geminiCliAdapter = { id: "gemini-cli", run: runGeminiCli };
|
|
984
|
+
//#endregion
|
|
457
985
|
//#region src/adapters/registry.ts
|
|
458
986
|
/** Adapter lookup table, keyed by adapter id. */
const ADAPTERS = {};
/**
 * Store a built-in adapter under `id`, replacing any existing entry.
 *
 * @param {string} id - Adapter id.
 * @param {object} adapter - Adapter object to store.
 */
function registerBuiltIn(id, adapter) {
	ADAPTERS[id] = adapter;
}
// Built-in adapters, registered in this order.
for (const [builtInId, builtInAdapter] of [
	["claude-code", claudeCodeAdapter],
	["codex", codexAdapter],
	["gemini-cli", geminiCliAdapter]
]) registerBuiltIn(builtInId, builtInAdapter);
|
|
463
993
|
/**
|
|
464
994
|
* Register a harness adapter by id.
|
|
465
995
|
*
|
|
@@ -505,12 +1035,36 @@ function toClaudeCodeConfig(layers, prompt) {
|
|
|
505
1035
|
merged.prompt = prompt;
|
|
506
1036
|
return merged;
|
|
507
1037
|
}
|
|
1038
|
+
/**
 * Flatten suite config layers into a single Codex adapter config.
 *
 * Layers are applied in order with shallow assignment. Within each layer the
 * generic keys are applied first and the keys of its `codex` object (if it is
 * an object) second, so `codex` entries win. `prompt` is always set last.
 *
 * @param {Iterable<object>} layers - Config layers, lowest precedence first.
 * @param {string} prompt - Prompt for the run.
 * @returns {object} The merged flat config.
 */
function toCodexConfig(layers, prompt) {
	const flat = {};
	for (const { codex: overrides, ...shared } of layers) {
		const adapterSpecific = overrides && typeof overrides === "object" ? overrides : undefined;
		// Object.assign skips an undefined source.
		Object.assign(flat, shared, adapterSpecific);
	}
	flat.prompt = prompt;
	return flat;
}
|
|
1049
|
+
/**
 * Flatten suite config layers into a single Gemini CLI adapter config.
 *
 * Layers are applied in order with shallow assignment. Within each layer the
 * generic keys are applied first and the keys of its `geminiCli` object (if it
 * is an object) second, so `geminiCli` entries win. `prompt` is always set last.
 *
 * @param {Iterable<object>} layers - Config layers, lowest precedence first.
 * @param {string} prompt - Prompt for the run.
 * @returns {object} The merged flat config.
 */
function toGeminiCliConfig(layers, prompt) {
	const flat = {};
	for (const { geminiCli: overrides, ...shared } of layers) {
		const adapterSpecific = overrides && typeof overrides === "object" ? overrides : undefined;
		// Object.assign skips an undefined source.
		Object.assign(flat, shared, adapterSpecific);
	}
	flat.prompt = prompt;
	return flat;
}
|
|
508
1060
|
/**
|
|
509
1061
|
* Resolve merged suite layers into the flat config shape expected by the
|
|
510
|
-
* selected harness adapter.
|
|
1062
|
+
* selected harness adapter (`claude-code`, `codex`, or `gemini-cli`).
|
|
511
1063
|
*/
|
|
512
1064
|
function resolveRunConfig(adapterId, layers, prompt) {
|
|
513
1065
|
if (adapterId === "claude-code" || adapterId === "claude-code") return toClaudeCodeConfig(layers, prompt);
|
|
1066
|
+
if (adapterId === "codex") return toCodexConfig(layers, prompt);
|
|
1067
|
+
if (adapterId === "gemini-cli") return toGeminiCliConfig(layers, prompt);
|
|
514
1068
|
const merged = {};
|
|
515
1069
|
for (const layer of layers) Object.assign(merged, layer);
|
|
516
1070
|
merged.prompt = prompt;
|
|
@@ -733,6 +1287,6 @@ async function runSuite(suite, options = {}) {
|
|
|
733
1287
|
return report;
|
|
734
1288
|
}
|
|
735
1289
|
//#endregion
|
|
736
|
-
export { aggregateCell as a, runRepetition as c, getDefaultAdapter as d, listAdapters as f,
|
|
1290
|
+
export { evaluateAll as _, aggregateCell as a, runRepetition as c, getDefaultAdapter as d, listAdapters as f, evaluate as g, buildJudgeArgs as h, DEFAULT_THRESHOLD as i, DEFAULT_ADAPTER_ID as l, prepareGeminiCliEnv as m, createLimit as n, getRepetitions as o, registerAdapter as p, DEFAULT_REPETITIONS as r, mergeConfig as s, runSuite as t, getAdapter as u };
|
|
737
1291
|
|
|
738
|
-
//# sourceMappingURL=suite-
|
|
1292
|
+
//# sourceMappingURL=suite-C3-8EjUW.js.map
|