@agentgrader/core 1.1.0 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +39 -0
- package/dist/index.js +44 -10
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -131,6 +131,7 @@ declare const AgentConfigSchema: z.ZodObject<{
|
|
|
131
131
|
id: z.ZodOptional<z.ZodString>;
|
|
132
132
|
name: z.ZodString;
|
|
133
133
|
model: z.ZodString;
|
|
134
|
+
provider: z.ZodOptional<z.ZodString>;
|
|
134
135
|
max_steps: z.ZodDefault<z.ZodNumber>;
|
|
135
136
|
temperature: z.ZodOptional<z.ZodNumber>;
|
|
136
137
|
system_prompt: z.ZodOptional<z.ZodString>;
|
|
@@ -167,6 +168,7 @@ declare const AgentConfigSchema: z.ZodObject<{
|
|
|
167
168
|
max_steps: number;
|
|
168
169
|
id?: string | undefined;
|
|
169
170
|
toolkits?: string[] | undefined;
|
|
171
|
+
provider?: string | undefined;
|
|
170
172
|
temperature?: number | undefined;
|
|
171
173
|
system_prompt?: string | undefined;
|
|
172
174
|
tools?: string[] | undefined;
|
|
@@ -184,6 +186,7 @@ declare const AgentConfigSchema: z.ZodObject<{
|
|
|
184
186
|
model: string;
|
|
185
187
|
id?: string | undefined;
|
|
186
188
|
toolkits?: string[] | undefined;
|
|
189
|
+
provider?: string | undefined;
|
|
187
190
|
max_steps?: number | undefined;
|
|
188
191
|
temperature?: number | undefined;
|
|
189
192
|
system_prompt?: string | undefined;
|
|
@@ -300,6 +303,7 @@ declare const RunSchema: z.ZodObject<{
|
|
|
300
303
|
error: z.ZodOptional<z.ZodString>;
|
|
301
304
|
finalDiff: z.ZodOptional<z.ZodString>;
|
|
302
305
|
metrics: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodAny>>;
|
|
306
|
+
matrixId: z.ZodOptional<z.ZodString>;
|
|
303
307
|
createdAt: z.ZodNumber;
|
|
304
308
|
completedAt: z.ZodOptional<z.ZodNumber>;
|
|
305
309
|
}, "strip", z.ZodTypeAny, {
|
|
@@ -319,6 +323,7 @@ declare const RunSchema: z.ZodObject<{
|
|
|
319
323
|
error?: string | undefined;
|
|
320
324
|
finalDiff?: string | undefined;
|
|
321
325
|
metrics?: Record<string, any> | undefined;
|
|
326
|
+
matrixId?: string | undefined;
|
|
322
327
|
completedAt?: number | undefined;
|
|
323
328
|
}, {
|
|
324
329
|
status: "running" | "completed" | "failed";
|
|
@@ -337,6 +342,7 @@ declare const RunSchema: z.ZodObject<{
|
|
|
337
342
|
error?: string | undefined;
|
|
338
343
|
finalDiff?: string | undefined;
|
|
339
344
|
metrics?: Record<string, any> | undefined;
|
|
345
|
+
matrixId?: string | undefined;
|
|
340
346
|
completedAt?: number | undefined;
|
|
341
347
|
}>;
|
|
342
348
|
type Run = z.infer<typeof RunSchema>;
|
|
@@ -493,6 +499,26 @@ interface ScorerResult {
|
|
|
493
499
|
passed: boolean;
|
|
494
500
|
detail: string;
|
|
495
501
|
score?: number;
|
|
502
|
+
/**
|
|
503
|
+
* non-functional code-quality signals. populated by additive scorers
|
|
504
|
+
* (e.g. staticqualityscorer, llmjudgescorer) that annotate a run without
|
|
505
|
+
* affecting `passed`/`score`. all fields optional - a scorer only fills
|
|
506
|
+
* in what it actually measured.
|
|
507
|
+
*/
|
|
508
|
+
quality?: {
|
|
509
|
+
/** total +/- lines in the agent's diff */
|
|
510
|
+
diffLines?: number;
|
|
511
|
+
/** number of files touched by the agent's diff */
|
|
512
|
+
filesModified?: number;
|
|
513
|
+
/** TODO/FIXME/HACK/XXX markers introduced by the diff */
|
|
514
|
+
todosIntroduced?: number;
|
|
515
|
+
/** linter (e.g. Biome) error+warning count on the changed files */
|
|
516
|
+
linterViolations?: number;
|
|
517
|
+
/** 0-1 holistic quality score from an LLM judge */
|
|
518
|
+
llmJudgeScore?: number;
|
|
519
|
+
/** prose rationale from an LLM judge */
|
|
520
|
+
llmJudgeDetail?: string;
|
|
521
|
+
};
|
|
496
522
|
}
|
|
497
523
|
interface Scorer {
|
|
498
524
|
readonly name: string;
|
|
@@ -618,6 +644,15 @@ interface RunSingleInput {
|
|
|
618
644
|
sandboxProvider: SandboxProvider;
|
|
619
645
|
db?: AgrDb;
|
|
620
646
|
runId: string;
|
|
647
|
+
/**
|
|
648
|
+
* additive, non-blocking scorers (e.g. staticqualityscorer,
|
|
649
|
+
* llmjudgescorer) run after the core pass/fail scoring. their results
|
|
650
|
+
* never affect `passed`/`score` - each scorer's `scorerresult` is merged
|
|
651
|
+
* into `metrics` under its own `name`.
|
|
652
|
+
*/
|
|
653
|
+
extraScorers?: Scorer[];
|
|
654
|
+
/** links this run to an optimizer matrix run, if any */
|
|
655
|
+
matrixId?: string;
|
|
621
656
|
}
|
|
622
657
|
interface RunSingleResult {
|
|
623
658
|
runId: string;
|
|
@@ -642,6 +677,10 @@ interface BenchmarkInput {
|
|
|
642
677
|
sandboxProvider: SandboxProvider;
|
|
643
678
|
db?: AgrDb;
|
|
644
679
|
concurrency?: number;
|
|
680
|
+
/** additive, non-blocking quality scorers run for every test case x config combination */
|
|
681
|
+
extraScorers?: Scorer[];
|
|
682
|
+
/** links every run in this benchmark to an optimizer matrix run, if any */
|
|
683
|
+
matrixId?: string;
|
|
645
684
|
onRunUpdate?: (run: RunSingleResult & {
|
|
646
685
|
testCaseId: string;
|
|
647
686
|
agentConfigId: string;
|
package/dist/index.js
CHANGED
|
@@ -79,6 +79,7 @@ var AgentConfigSchema = z.object({
|
|
|
79
79
|
id: z.string().optional(),
|
|
80
80
|
name: z.string(),
|
|
81
81
|
model: z.string(),
|
|
82
|
+
provider: z.string().optional(),
|
|
82
83
|
max_steps: z.number().default(30),
|
|
83
84
|
temperature: z.number().optional(),
|
|
84
85
|
system_prompt: z.string().optional(),
|
|
@@ -105,8 +106,12 @@ var RunSchema = z.object({
|
|
|
105
106
|
error: z.string().optional(),
|
|
106
107
|
finalDiff: z.string().optional(),
|
|
107
108
|
// extended scoring metrics: regression (FAIL_TO_PASS/PASS_TO_PASS), diff stats,
|
|
108
|
-
// localization precision/recall,
|
|
109
|
+
// localization precision/recall, additive quality scorers (keyed by scorer
|
|
110
|
+
// name, ScorerResult-shaped), etc. Stored as JSON.
|
|
109
111
|
metrics: z.record(z.any()).optional(),
|
|
112
|
+
// links this run back to the optimizer matrix run that generated its
|
|
113
|
+
// agentConfig, if any (see @agentgrader/optimizer).
|
|
114
|
+
matrixId: z.string().optional(),
|
|
110
115
|
createdAt: z.number(),
|
|
111
116
|
completedAt: z.number().optional()
|
|
112
117
|
});
|
|
@@ -575,7 +580,7 @@ function buildSkillsPromptAddendum(skills) {
|
|
|
575
580
|
|
|
576
581
|
// src/runner/run-single.ts
|
|
577
582
|
async function runSingle(input) {
|
|
578
|
-
const { testCase, agentConfig, adapter, sandboxProvider, db, runId } = input;
|
|
583
|
+
const { testCase, agentConfig, adapter, sandboxProvider, db, runId, extraScorers, matrixId } = input;
|
|
579
584
|
const startTime = Date.now();
|
|
580
585
|
let sandbox = null;
|
|
581
586
|
let passed = false;
|
|
@@ -598,6 +603,7 @@ async function runSingle(input) {
|
|
|
598
603
|
agentConfigId: agentConfig.id || agentConfig.name,
|
|
599
604
|
sandboxProvider: sandboxProvider.name,
|
|
600
605
|
status: "running",
|
|
606
|
+
matrixId,
|
|
601
607
|
createdAt: Math.floor(startTime / 1e3)
|
|
602
608
|
});
|
|
603
609
|
}
|
|
@@ -701,6 +707,21 @@ ${addendum}` : addendum
|
|
|
701
707
|
}),
|
|
702
708
|
execute: async () => {
|
|
703
709
|
if (!sandbox) throw new Error("Sandbox not initialized");
|
|
710
|
+
if (extraScorers && extraScorers.length > 0 && agentResult) {
|
|
711
|
+
const trace2 = { runId, steps: emittedSteps };
|
|
712
|
+
for (const scorer of extraScorers) {
|
|
713
|
+
try {
|
|
714
|
+
metrics[scorer.name] = await scorer.score({
|
|
715
|
+
testCase,
|
|
716
|
+
result: agentResult,
|
|
717
|
+
trace: trace2,
|
|
718
|
+
sandbox
|
|
719
|
+
});
|
|
720
|
+
} catch (e) {
|
|
721
|
+
metrics[scorer.name] = { passed: true, detail: `Scorer error: ${e.message}` };
|
|
722
|
+
}
|
|
723
|
+
}
|
|
724
|
+
}
|
|
704
725
|
const cmdScorer = new CommandScorer();
|
|
705
726
|
const cmdResult = await cmdScorer.score({
|
|
706
727
|
testCase,
|
|
@@ -790,10 +811,17 @@ ${addendum}` : addendum
|
|
|
790
811
|
inputData: {},
|
|
791
812
|
initialState: runState
|
|
792
813
|
});
|
|
793
|
-
const
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
814
|
+
const scoreStep2 = res.steps?.score;
|
|
815
|
+
if (res.status === "success" && scoreStep2?.status === "success") {
|
|
816
|
+
const scoreOutput = scoreStep2.output;
|
|
817
|
+
passed = scoreOutput?.passed ?? false;
|
|
818
|
+
score = scoreOutput?.score ?? 0;
|
|
819
|
+
errorMsg = scoreOutput?.passed ? void 0 : scoreOutput?.detail;
|
|
820
|
+
} else {
|
|
821
|
+
passed = false;
|
|
822
|
+
score = 0;
|
|
823
|
+
errorMsg = res.error?.message ?? "Workflow did not complete successfully";
|
|
824
|
+
}
|
|
797
825
|
} catch (err) {
|
|
798
826
|
errorMsg = err.message || "Unknown execution error";
|
|
799
827
|
passed = false;
|
|
@@ -836,7 +864,7 @@ ${addendum}` : addendum
|
|
|
836
864
|
};
|
|
837
865
|
}
|
|
838
866
|
async function runBenchmark(input) {
|
|
839
|
-
const { testCases, agentConfigs, adapter, adapters, sandboxProvider, db, concurrency = 2, onRunUpdate } = input;
|
|
867
|
+
const { testCases, agentConfigs, adapter, adapters, sandboxProvider, db, concurrency = 2, extraScorers, matrixId, onRunUpdate } = input;
|
|
840
868
|
const actualAdapters = adapters || (adapter ? [adapter] : []);
|
|
841
869
|
if (actualAdapters.length === 0) {
|
|
842
870
|
throw new Error("You must provide either 'adapter' or 'adapters' to runBenchmark.");
|
|
@@ -883,6 +911,8 @@ async function runBenchmark(input) {
|
|
|
883
911
|
const sandboxProvider2 = getVal("sandboxProvider");
|
|
884
912
|
const db2 = getVal("db");
|
|
885
913
|
const onRunUpdate2 = getVal("onRunUpdate");
|
|
914
|
+
const extraScorers2 = getVal("extraScorers");
|
|
915
|
+
const matrixId2 = getVal("matrixId");
|
|
886
916
|
const runId = randomUUID();
|
|
887
917
|
if (onRunUpdate2) {
|
|
888
918
|
onRunUpdate2({
|
|
@@ -906,7 +936,9 @@ async function runBenchmark(input) {
|
|
|
906
936
|
adapter: adapter2,
|
|
907
937
|
sandboxProvider: sandboxProvider2,
|
|
908
938
|
db: db2,
|
|
909
|
-
runId
|
|
939
|
+
runId,
|
|
940
|
+
extraScorers: extraScorers2,
|
|
941
|
+
matrixId: matrixId2
|
|
910
942
|
});
|
|
911
943
|
if (onRunUpdate2) {
|
|
912
944
|
onRunUpdate2({
|
|
@@ -953,7 +985,9 @@ async function runBenchmark(input) {
|
|
|
953
985
|
["adapters", actualAdapters],
|
|
954
986
|
["sandboxProvider", sandboxProvider],
|
|
955
987
|
["db", db],
|
|
956
|
-
["onRunUpdate", onRunUpdate]
|
|
988
|
+
["onRunUpdate", onRunUpdate],
|
|
989
|
+
["extraScorers", extraScorers],
|
|
990
|
+
["matrixId", matrixId]
|
|
957
991
|
]);
|
|
958
992
|
const run = await workflow.createRun();
|
|
959
993
|
const res = await run.start({
|
|
@@ -961,7 +995,7 @@ async function runBenchmark(input) {
|
|
|
961
995
|
initialState: runState,
|
|
962
996
|
requestContext: executionContext
|
|
963
997
|
});
|
|
964
|
-
const rawRuns = res.
|
|
998
|
+
const rawRuns = res.steps?.executeSingleRun?.output || [];
|
|
965
999
|
return {
|
|
966
1000
|
runs: Array.isArray(rawRuns) ? rawRuns : []
|
|
967
1001
|
};
|