agentgrader 1.0.5 → 1.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/dist/index.js +220 -39
  2. package/package.json +7 -7
package/dist/index.js CHANGED
@@ -11,11 +11,17 @@ import { AiSdkAgentAdapter } from '@agentgrader/agent-openrouter';
11
11
  import { StaticQualityScorer } from '@agentgrader/scorer-static';
12
12
  import { expandMatrix, MatrixSchema, aggregateResults, paretoFront } from '@agentgrader/optimizer';
13
13
  import { jsx, jsxs } from 'react/jsx-runtime';
14
- import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync } from 'fs';
15
- import { stringify, parse } from 'yaml';
14
+ import { mkdirSync, writeFileSync, readFileSync, readdirSync, statSync, existsSync } from 'fs';
15
+ import { parse, stringify } from 'yaml';
16
16
  import { ZodError } from 'zod';
17
17
  import { execFileSync } from 'child_process';
18
18
 
19
+ var CONFIG_COL_WIDTH = 24;
20
+ var CONFIG_LABEL_MAX = 20;
21
+ function truncateLabel(name, max = CONFIG_LABEL_MAX) {
22
+ if (name.length <= max) return name;
23
+ return `${name.slice(0, max - 1)}\u2026`;
24
+ }
19
25
  var Dashboard = ({ runs, testCases, configs, isFinished }) => {
20
26
  let totalCost = 0;
21
27
  let totalSteps = 0;
@@ -67,7 +73,7 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
67
73
  r.testCaseId
68
74
  ] }),
69
75
  /* @__PURE__ */ jsx(Text, { color: "gray", children: " with " }),
70
- /* @__PURE__ */ jsx(Text, { color: "blue", children: r.agentConfigId }),
76
+ /* @__PURE__ */ jsx(Text, { color: "blue", wrap: "truncate-end", children: truncateLabel(r.agentConfigId) }),
71
77
  /* @__PURE__ */ jsxs(Text, { color: "gray", children: [
72
78
  " (Steps: ",
73
79
  r.stepsCount,
@@ -81,22 +87,22 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
81
87
  /* @__PURE__ */ jsxs(Box, { flexDirection: "column", borderStyle: "single", borderColor: "gray", padding: 1, children: [
82
88
  /* @__PURE__ */ jsxs(Box, { flexDirection: "row", marginBottom: 1, children: [
83
89
  /* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { bold: true, color: "cyan", children: "Test Case" }) }),
84
- configs.map((cfg) => /* @__PURE__ */ jsx(Box, { width: 20, children: /* @__PURE__ */ jsx(Text, { bold: true, color: "blue", children: cfg }) }, cfg))
90
+ configs.map((cfg) => /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { bold: true, color: "blue", wrap: "truncate-end", children: truncateLabel(cfg) }) }, cfg))
85
91
  ] }),
86
92
  testCases.map((tc) => /* @__PURE__ */ jsxs(Box, { flexDirection: "row", children: [
87
- /* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { children: tc }) }),
93
+ /* @__PURE__ */ jsx(Box, { width: 25, children: /* @__PURE__ */ jsx(Text, { wrap: "truncate-end", children: tc }) }),
88
94
  configs.map((cfg) => {
89
95
  const key = `${tc}_${cfg}`;
90
96
  const run = runs[key];
91
97
  if (!run) {
92
- return /* @__PURE__ */ jsx(Box, { width: 20, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: "queued" }) }, cfg);
98
+ return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { color: "gray", children: "queued" }) }, cfg);
93
99
  }
94
100
  if (run.status === "running") {
95
- return /* @__PURE__ */ jsx(Box, { width: 20, children: /* @__PURE__ */ jsx(Text, { color: "yellow", children: "running..." }) }, cfg);
101
+ return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsx(Text, { color: "yellow", children: "running..." }) }, cfg);
96
102
  }
97
103
  if (run.status === "failed" || !run.passed) {
98
104
  const seconds2 = (run.durationMs / 1e3).toFixed(1);
99
- return /* @__PURE__ */ jsx(Box, { width: 20, children: /* @__PURE__ */ jsxs(Text, { color: "red", children: [
105
+ return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsxs(Text, { color: "red", wrap: "truncate-end", children: [
100
106
  "\u2717 ",
101
107
  seconds2,
102
108
  "s ($",
@@ -105,7 +111,7 @@ var Dashboard = ({ runs, testCases, configs, isFinished }) => {
105
111
  ] }) }, cfg);
106
112
  }
107
113
  const seconds = (run.durationMs / 1e3).toFixed(1);
108
- return /* @__PURE__ */ jsx(Box, { width: 20, children: /* @__PURE__ */ jsxs(Text, { color: "green", children: [
114
+ return /* @__PURE__ */ jsx(Box, { width: CONFIG_COL_WIDTH, children: /* @__PURE__ */ jsxs(Text, { color: "green", wrap: "truncate-end", children: [
109
115
  "\u2713 ",
110
116
  seconds,
111
117
  "s ($",
@@ -419,20 +425,54 @@ function printTagBreakdown(testCases, agentConfigs, runStates) {
419
425
  }
420
426
  console.log("=================================================\n");
421
427
  }
422
- async function validateCommand(testCasePath) {
428
+ function isSkippedCheck(check) {
429
+ return check.name.toLowerCase().includes("(skipped") || check.detail.toLowerCase().includes("skipping");
430
+ }
431
+ function checkIcon(check) {
432
+ if (!check.passed) return "\u274C";
433
+ if (isSkippedCheck(check)) return "\u26A0\uFE0F";
434
+ return "\u2705";
435
+ }
436
+ async function validateCommand(testCasePath, opts) {
423
437
  const testCase = loadTestCase(testCasePath);
438
+ if (opts?.strict) {
439
+ const missing = [];
440
+ if (!testCase.test_command) missing.push("test_command");
441
+ if (!testCase.fail_to_pass?.length) missing.push("fail_to_pass");
442
+ if (!testCase.pass_to_pass?.length) missing.push("pass_to_pass");
443
+ if (missing.length > 0) {
444
+ console.error(
445
+ `Strict validation requires: ${missing.join(", ")}. Fill these fields before running in CI.`
446
+ );
447
+ process.exit(1);
448
+ }
449
+ }
424
450
  console.log(`Validating "${testCase.name}" (${testCasePath})...
425
451
  `);
426
452
  const sandboxProvider = new DockerSandboxProvider();
427
453
  const report = await validateTestCase({ testCase, sandboxProvider });
454
+ const hadExecutionSkip = report.checks.some(
455
+ (c) => c.name.includes("execution-checks (skipped")
456
+ );
428
457
  for (const check of report.checks) {
429
- const icon = check.passed ? "\u2705" : "\u274C";
458
+ const icon = checkIcon(check);
430
459
  console.log(`${icon} ${check.name}`);
431
460
  if (check.detail && check.detail !== "ok") {
432
461
  const indented = check.detail.split("\n").map((line) => ` ${line}`).join("\n");
433
462
  console.log(indented);
434
463
  }
435
464
  }
465
+ if (hadExecutionSkip) {
466
+ console.log("");
467
+ console.log(
468
+ "Note: this was a static-only validation (no test_command configured) - Docker/patch execution checks were skipped."
469
+ );
470
+ if (report.ok && !opts?.strict) {
471
+ console.log(
472
+ "Tip: run with --strict to enforce test_command, fail_to_pass, and pass_to_pass as a CI gate."
473
+ );
474
+ }
475
+ }
436
476
  console.log("");
437
477
  console.log(report.ok ? "\u2705 Validation passed." : "\u274C Validation failed.");
438
478
  process.exit(report.ok ? 0 : 1);
@@ -477,26 +517,40 @@ async function importPrCommand(repo, prNumber, opts) {
477
517
  if (testDiff.trim()) {
478
518
  writeFileSync(resolve(outDir, "test_patch.patch"), testDiff);
479
519
  }
520
+ const fixtureDir = resolve(outDir, "fixture");
521
+ if (opts.cloneFixture) {
522
+ console.log(`
523
+ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
524
+ execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
525
+ stdio: "inherit"
526
+ });
527
+ console.log(`Checking out base commit ${pr.base.sha}...`);
528
+ execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
529
+ }
530
+ const projectKind = opts.cloneFixture ? detectProjectKind(fixtureDir) : "unknown";
531
+ const { success, test_command, testCommandHint } = projectTestDefaults(
532
+ projectKind,
533
+ opts.cloneFixture ?? false,
534
+ fixtureDir
535
+ );
480
536
  const yamlDoc = {
481
537
  name: slug,
482
538
  description: pr.title,
483
539
  fixture: "./fixture",
484
540
  prompt: buildPrompt(pr),
485
- success: [{ run: "npm install && npm test", expect: { exit_code: 0 } }],
541
+ success,
486
542
  timeout_seconds: 600,
487
543
  tags: ["imported", repoName],
488
544
  created_at: pr.created_at,
489
- // TODO: fill these in after setting up ./fixture (checked out at
490
- // base.sha below) and running the test suite to discover real test names.
491
- test_command: "<TODO: e.g. tsx --test --test-reporter=tap src/**/*.test.ts>",
492
- fail_to_pass: ["<TODO: fill in via `agr validate`>"],
493
- pass_to_pass: ["<TODO: fill in via `agr validate`>"]
545
+ test_command,
546
+ fail_to_pass: [],
547
+ pass_to_pass: []
494
548
  };
495
549
  if (solutionDiff.trim()) yamlDoc.solution = "./solution.patch";
496
550
  if (testDiff.trim()) yamlDoc.test_patch = "./test_patch.patch";
497
551
  if (expectedFiles.length > 0) yamlDoc.expected_files = expectedFiles;
498
552
  if (forbidModified.length > 0) yamlDoc.forbid_modified = forbidModified;
499
- writeFileSync(resolve(outDir, "agr.yaml"), stringify(yamlDoc));
553
+ writeFileSync(resolve(outDir, "agr.yaml"), buildAgrYaml(yamlDoc, testCommandHint));
500
554
  console.log(`
501
555
  Imported PR #${pr.number}: "${pr.title}"`);
502
556
  console.log(`Wrote scaffold to: ${outDir}`);
@@ -505,16 +559,6 @@ Imported PR #${pr.number}: "${pr.title}"`);
505
559
  console.log(` - solution.patch (${expectedFiles.length} file(s) changed)`);
506
560
  if (testDiff.trim())
507
561
  console.log(` - test_patch.patch (${forbidModified.length} test file(s) changed)`);
508
- if (opts.cloneFixture) {
509
- const fixtureDir = resolve(outDir, "fixture");
510
- console.log(`
511
- Cloning ${owner}/${repoName} into ${fixtureDir}...`);
512
- execFileSync("git", ["clone", `https://github.com/${owner}/${repoName}.git`, fixtureDir], {
513
- stdio: "inherit"
514
- });
515
- console.log(`Checking out base commit ${pr.base.sha}...`);
516
- execFileSync("git", ["checkout", pr.base.sha], { cwd: fixtureDir, stdio: "inherit" });
517
- }
518
562
  console.log("\nNext steps:");
519
563
  if (!opts.cloneFixture) {
520
564
  console.log(` 1. Check out ${owner}/${repoName}@${pr.base.sha} into ${outDir}/fixture`);
@@ -522,8 +566,11 @@ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
522
566
  console.log(
523
567
  ` 3. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
524
568
  );
569
+ console.log(
570
+ "\nNote: test_command/success defaults were NOT auto-detected because --clone-fixture was not set. Re-run with --clone-fixture to get language-specific defaults, or fill these fields manually."
571
+ );
525
572
  } else {
526
- console.log(" 1. Fill in test_command, fail_to_pass, and pass_to_pass in agr.yaml");
573
+ console.log(" 1. Fill in fail_to_pass and pass_to_pass in agr.yaml");
527
574
  console.log(
528
575
  ` 2. Run "agr validate ${resolve(outDir, "agr.yaml")}" to verify the test case`
529
576
  );
@@ -533,6 +580,92 @@ Cloning ${owner}/${repoName} into ${fixtureDir}...`);
533
580
  await validateCommand(resolve(outDir, "agr.yaml"));
534
581
  }
535
582
  }
583
+ function detectProjectKind(fixtureDir) {
584
+ if (existsSync(resolve(fixtureDir, "pyproject.toml")) || existsSync(resolve(fixtureDir, "setup.py")) || readdirSync(fixtureDir).some((name) => /^requirements.*\.txt$/i.test(name))) {
585
+ return "python";
586
+ }
587
+ if (existsSync(resolve(fixtureDir, "package.json"))) return "node";
588
+ if (existsSync(resolve(fixtureDir, "go.mod"))) return "go";
589
+ return "unknown";
590
+ }
591
+ function projectTestDefaults(kind, cloned, fixtureDir) {
592
+ if (!cloned) {
593
+ return {
594
+ success: [
595
+ { run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
596
+ ],
597
+ test_command: "<TODO: shell command that runs tests with TAP output>",
598
+ testCommandHint: "none"
599
+ };
600
+ }
601
+ switch (kind) {
602
+ case "python":
603
+ return {
604
+ success: [{ run: "pip install -e . && pytest", expect: { exit_code: 0 } }],
605
+ test_command: "pytest --tap-stream",
606
+ testCommandHint: "python"
607
+ };
608
+ case "node":
609
+ return detectNodeTestRunner(fixtureDir);
610
+ case "go":
611
+ return {
612
+ success: [{ run: "go test ./...", expect: { exit_code: 0 } }],
613
+ test_command: "<TODO: configure a TAP-producing test command for go>",
614
+ testCommandHint: "go"
615
+ };
616
+ default:
617
+ return {
618
+ success: [
619
+ { run: "<TODO: install dependencies and run the test suite>", expect: { exit_code: 0 } }
620
+ ],
621
+ test_command: "<TODO: shell command that runs tests with TAP output>",
622
+ testCommandHint: "none"
623
+ };
624
+ }
625
+ }
626
+ function detectNodeTestRunner(fixtureDir) {
627
+ const success = [{ run: "npm install && npm test", expect: { exit_code: 0 } }];
628
+ const fallback = {
629
+ success,
630
+ test_command: "tsx --test --test-reporter=tap src/**/*.test.ts",
631
+ testCommandHint: "node-unknown"
632
+ };
633
+ try {
634
+ const pkgPath = resolve(fixtureDir, "package.json");
635
+ if (!existsSync(pkgPath)) return fallback;
636
+ const pkg = JSON.parse(readFileSync(pkgPath, "utf-8"));
637
+ const deps = { ...pkg.dependencies, ...pkg.devDependencies };
638
+ if (deps.ava) {
639
+ return { success, test_command: "npx ava --tap", testCommandHint: "ava" };
640
+ }
641
+ if (deps.vitest) {
642
+ return { success, test_command: "npx vitest run --reporter=tap", testCommandHint: "vitest" };
643
+ }
644
+ if (deps.jest) {
645
+ return { success, test_command: "npx jest --ci", testCommandHint: "jest" };
646
+ }
647
+ return fallback;
648
+ } catch {
649
+ return fallback;
650
+ }
651
+ }
652
+ function buildAgrYaml(doc, testCommandHint) {
653
+ let yaml = stringify(doc);
654
+ const testListComment = "# TODO: run the test suite (see test_command above) and add real test names here.\n# agr validate checks pre/post-patch status once these fields are filled in.";
655
+ yaml = yaml.replace(/^fail_to_pass:/m, `${testListComment}
656
+ fail_to_pass:`);
657
+ const testCommandComments = {
658
+ python: "# Requires pytest-tap for TAP output (pip install pytest-tap).",
659
+ jest: "# jest does not output TAP by default; consider jest-tap-reporter",
660
+ "node-unknown": "# test_command could not be auto-detected reliably - verify this matches the project's actual test setup"
661
+ };
662
+ const comment = testCommandComments[testCommandHint];
663
+ if (comment) {
664
+ yaml = yaml.replace(/^test_command: (.+)$/m, `${comment}
665
+ $&`);
666
+ }
667
+ return yaml;
668
+ }
536
669
  function buildPrompt(pr) {
537
670
  const body = (pr.body || "").trim();
538
671
  return body ? `${pr.title}
@@ -563,6 +696,35 @@ function splitDiff(diff) {
563
696
  forbidModified
564
697
  };
565
698
  }
699
+ var VERBOSE_CONTENT_MAX = 200;
700
+ function truncateForVerbose(value, max = VERBOSE_CONTENT_MAX) {
701
+ if (value.length <= max) return value;
702
+ return `${value.slice(0, max)}...`;
703
+ }
704
+ function formatVerboseStep(step) {
705
+ const prefix = `[step ${step.index}] ${step.kind}`;
706
+ if (step.kind === "tool_call" && step.tool) {
707
+ const args = step.content ? truncateForVerbose(step.content) : "";
708
+ return `${prefix}: ${step.tool}(${args})`;
709
+ }
710
+ if (step.kind === "tool_result" && step.tool) {
711
+ const result = step.content ? truncateForVerbose(step.content) : "";
712
+ return `${prefix}: ${step.tool} -> ${result}`;
713
+ }
714
+ if (step.kind === "message" && step.content) {
715
+ return `${prefix}: ${truncateForVerbose(step.content)}`;
716
+ }
717
+ if (step.content) {
718
+ return `${prefix}: ${truncateForVerbose(step.content)}`;
719
+ }
720
+ return prefix;
721
+ }
722
+ function formatMetricDetail(label, detail) {
723
+ if (/^No .+ configured; skipping/.test(detail)) {
724
+ return `\u26A0\uFE0F ${label}: ${detail}`;
725
+ }
726
+ return `${label}: ${detail}`;
727
+ }
566
728
  async function runSingleCommand(testCasePath, opts) {
567
729
  const testCase = loadTestCase(testCasePath);
568
730
  let agentConfig = {
@@ -595,7 +757,10 @@ async function runSingleCommand(testCasePath, opts) {
595
757
  adapter,
596
758
  sandboxProvider,
597
759
  db,
598
- runId
760
+ runId,
761
+ onStep: opts.verbose ? (step) => {
762
+ console.log(formatVerboseStep(step));
763
+ } : void 0
599
764
  });
600
765
  console.log("\n================ RUN SUMMARY ================");
601
766
  console.log(`Status: ${result.passed ? "\u2705 PASSED" : "\u274C FAILED"}`);
@@ -606,13 +771,15 @@ async function runSingleCommand(testCasePath, opts) {
606
771
  console.log(`Error: ${result.error}`);
607
772
  }
608
773
  if (result.metrics?.regression) {
609
- console.log(`Regression: ${result.metrics.regression.detail}`);
774
+ console.log(formatMetricDetail("Regression", result.metrics.regression.detail));
610
775
  }
611
776
  if (result.metrics?.diff) {
612
777
  console.log(`Diff scope: ${result.metrics.diff.detail.split("\n")[0]}`);
613
778
  }
614
779
  if (result.metrics?.localization) {
615
- console.log(`Localization: ${result.metrics.localization.detail.split("\n")[0]}`);
780
+ console.log(
781
+ formatMetricDetail("Localization", result.metrics.localization.detail.split("\n")[0])
782
+ );
616
783
  }
617
784
  console.log("=============================================\n");
618
785
  } catch (err) {
@@ -701,7 +868,10 @@ function safeParseJson(value) {
701
868
 
702
869
  // src/index.ts
703
870
  var cli = cac("agr");
704
- cli.command("run <testCase>", "Run a single agent test case").option("--config <config>", "Path to an AgentConfig YAML file").action(async (testCase, options) => {
871
+ cli.command("run <testCase>", "Run a single agent test case").option("--config <config>", "Path to an AgentConfig YAML file").option(
872
+ "--verbose",
873
+ "Stream agent steps live to the console as they happen"
874
+ ).action(async (testCase, options) => {
705
875
  try {
706
876
  await runSingleCommand(testCase, options);
707
877
  } catch (err) {
@@ -709,12 +879,17 @@ cli.command("run <testCase>", "Run a single agent test case").option("--config <
709
879
  process.exit(1);
710
880
  }
711
881
  });
712
- cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
882
+ cli.command("bench", "Run a benchmark matrix of multiple test cases and configs").option("--configs <configs>", "Comma-separated paths to AgentConfig YAML files").option("--config <config>", "Alias for --configs (single config path)").option("--suite <suite>", "Path to test suite directory containing test cases").option("--concurrency <concurrency>", "Number of parallel sandbox executions", { default: 2 }).option(
713
883
  "--matrix <matrix>",
714
884
  "Path to an optimizer matrix YAML file - expands into agent configs and prints a Pareto summary afterwards (alternative to --configs)"
715
- ).action(async (options) => {
885
+ ).example("agr bench --suite tasks --configs agent.yaml,agent-openrouter.yaml").example("agr bench --suite tasks --matrix optimizer-matrix.yaml").action(async (options) => {
886
+ if (!options.configs && options.config) {
887
+ options.configs = options.config;
888
+ }
716
889
  if (!options.suite || !options.configs && !options.matrix) {
717
- console.error("Error: --suite and either --configs or --matrix are required for benchmarking.");
890
+ console.error(
891
+ "Error: --suite and either --configs, --config, or --matrix are required for benchmarking."
892
+ );
718
893
  process.exit(1);
719
894
  }
720
895
  try {
@@ -732,9 +907,12 @@ cli.command("bench", "Run a benchmark matrix of multiple test cases and configs"
732
907
  cli.command(
733
908
  "validate <testCase>",
734
909
  "Validate a test case definition (fixture, fail_to_pass/pass_to_pass, gold patch)"
735
- ).action(async (testCase) => {
910
+ ).option(
911
+ "--strict",
912
+ "Exit with code 1 if test_command or fail_to_pass/pass_to_pass are missing"
913
+ ).action(async (testCase, options) => {
736
914
  try {
737
- await validateCommand(testCase);
915
+ await validateCommand(testCase, options);
738
916
  } catch (err) {
739
917
  console.error(`Error executing validate: ${err.message}`);
740
918
  process.exit(1);
@@ -743,7 +921,10 @@ cli.command(
743
921
  cli.command(
744
922
  "import-pr <repo> <prNumber>",
745
923
  "Scaffold a test case from a GitHub pull request (e.g. owner/repo 1234)"
746
- ).option("--out <dir>", "Output directory for the scaffolded test case").option("--clone-fixture", "Clone the repo and check out the PR's base commit into ./fixture").option("--validate", "Run `agr validate` against the scaffolded test case afterwards").action(async (repo, prNumber, options) => {
924
+ ).option("--out <dir>", "Output directory for the scaffolded test case").option(
925
+ "--clone-fixture",
926
+ "Clone the repo and check out the PR's base commit into ./fixture (required for language/test-command auto-detection)"
927
+ ).option("--validate", "Run `agr validate` against the scaffolded test case afterwards").example("agr import-pr astropy/astropy 12907 --clone-fixture --validate").action(async (repo, prNumber, options) => {
747
928
  try {
748
929
  await importPrCommand(repo, prNumber, options);
749
930
  } catch (err) {
package/package.json CHANGED
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "agentgrader",
3
- "version": "1.0.5",
3
+ "version": "1.0.7",
4
4
  "description": "CLI for the Agentgrader benchmarking framework — run and bench coding agents",
5
5
  "license": "MIT",
6
6
  "type": "module",
7
7
  "bin": {
8
- "agr": "./dist/index.js",
9
- "agentgrader": "./dist/index.js"
8
+ "agr": "dist/index.js",
9
+ "agentgrader": "dist/index.js"
10
10
  },
11
11
  "main": "./dist/index.js",
12
12
  "types": "./dist/index.d.ts",
@@ -19,12 +19,12 @@
19
19
  "dev": "bun run src/index.ts"
20
20
  },
21
21
  "dependencies": {
22
- "@agentgrader/agent-openrouter": "^2.0.1",
23
- "@agentgrader/core": "^1.1.1",
24
- "@agentgrader/optimizer": "^0.1.0",
22
+ "@agentgrader/agent-openrouter": "^2.0.3",
23
+ "@agentgrader/core": "^1.1.3",
24
+ "@agentgrader/optimizer": "^0.1.1",
25
25
  "@agentgrader/sandbox-docker": "^2.0.2",
26
26
  "@agentgrader/scorer-static": "^0.1.0",
27
- "@agentgrader/store": "^1.0.2",
27
+ "@agentgrader/store": "^1.0.3",
28
28
  "cac": "^6.7.14",
29
29
  "dotenv": "^17.4.2",
30
30
  "ink": "^4.4.1",