@agentgrader/scorer-llm-judge 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +30 -0
- package/dist/index.js +61 -0
- package/package.json +36 -0
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import { Scorer, TestCase, AgentResult, ScorerResult } from '@agentgrader/core';
|
|
2
|
+
|
|
3
|
+
/** Construction options for the LLM-as-judge scorer; every field is optional. */
interface LlmJudgeScorerOptions {
  /** Which AI SDK provider to judge with. Defaults to "anthropic". */
  provider?: "anthropic" | "openai";
  /**
   * Model slug for the chosen provider. When omitted, the bundled
   * implementation uses "claude-3-5-haiku-20241022" for anthropic and
   * "gpt-4o-mini" for openai (small, cheap models suitable for judging).
   */
  model?: string;
}
|
|
9
|
+
/**
 * Additive, non-blocking LLM-as-judge scorer: asks a model to rate the
 * agent's diff (0-1) for code quality and fit to the task, given the
 * original prompt. Never blocks a run (`passed` is always `true`) - it
 * only annotates `metrics["llm-judge"].quality.{llmJudgeScore,llmJudgeDetail}`.
 *
 * Degrades gracefully (no `quality` field) if the diff is empty or the
 * judge call fails (e.g. missing API key, network unavailable); in the
 * failure case the reason is reported in the result's `detail` string.
 */
declare class LlmJudgeScorer implements Scorer {
  /** Scorer identifier. */
  readonly name = "llm-judge";
  private readonly provider;
  private readonly modelName;
  /**
   * @param options Optional provider/model overrides; see
   *   {@link LlmJudgeScorerOptions} for the defaults.
   */
  constructor(options?: LlmJudgeScorerOptions);
  /**
   * Judges the diff produced for one test case.
   *
   * @param input The test case (its prompt is shown to the judge) and the
   *   agent result (its final diff is the patch being judged).
   * @returns A result whose `passed` is always `true`; `quality` is present
   *   only when the judge call succeeded.
   */
  score(input: {
    testCase: TestCase;
    result: AgentResult;
  }): Promise<ScorerResult>;
  private resolveModel;
}
|
|
29
|
+
|
|
30
|
+
export { LlmJudgeScorer, type LlmJudgeScorerOptions };
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { createAnthropic } from '@ai-sdk/anthropic';
|
|
2
|
+
import { createOpenAI } from '@ai-sdk/openai';
|
|
3
|
+
import { generateObject } from 'ai';
|
|
4
|
+
import { z } from 'zod';
|
|
5
|
+
|
|
6
|
+
// src/index.ts
|
|
7
|
+
/**
 * Shape the judge model must return: a numeric `score` constrained to the
 * inclusive range 0..1 and a free-text `rationale`.
 */
var JudgeResponseSchema = z.object({
  score: z.number().min(0).max(1),
  rationale: z.string()
});
|
|
11
|
+
/**
 * Model slug used for each supported provider when the caller does not
 * pass an explicit `model` option. Keys are the accepted provider names.
 */
var DEFAULT_MODELS = {
  anthropic: "claude-3-5-haiku-20241022",
  openai: "gpt-4o-mini"
};
|
|
15
|
+
/**
 * Non-blocking LLM-as-judge scorer.
 *
 * Sends the task prompt and the agent's final diff to a judge model and
 * reports the returned 0-1 score and one-sentence rationale. Every return
 * path sets `passed: true`, so this scorer never fails a run by itself.
 */
var LlmJudgeScorer = class {
  /** Scorer identifier. */
  name = "llm-judge";
  /** Provider name: "anthropic" unless overridden through the options. */
  provider;
  /** Model slug handed to the provider factory in resolveModel(). */
  modelName;

  /**
   * @param {{ provider?: "anthropic" | "openai", model?: string }} [options]
   *   Optional overrides. `provider` defaults to "anthropic"; `model`
   *   defaults to the DEFAULT_MODELS entry for the selected provider.
   */
  constructor(options = {}) {
    this.provider = options.provider ?? "anthropic";
    this.modelName = options.model ?? DEFAULT_MODELS[this.provider];
  }

  /**
   * Judges the diff produced for one test case.
   *
   * @param {{ testCase: { prompt: string }, result: { finalDiff?: string | null } }} input
   *   Only `testCase.prompt` and `result.finalDiff` are read.
   * @returns {Promise<object>} An object with `passed: true` and a `detail`
   *   string; `quality.{llmJudgeScore,llmJudgeDetail}` is added only when the
   *   judge call succeeds.
   */
  async score(input) {
    const diff = input.result.finalDiff ?? "";
    // A missing or whitespace-only diff is not sent to the judge.
    if (!diff.trim()) {
      return { passed: true, detail: "llm-judge: no diff to judge" };
    }
    try {
      const { object } = await generateObject({
        model: this.resolveModel(),
        schema: JudgeResponseSchema,
        system: "You are a senior software engineer reviewing a coding agent's patch. Rate the patch from 0 (poor: incorrect, sloppy, or unrelated changes) to 1 (excellent: correct, minimal, idiomatic, and focused on the task). Respond with a score and a one-sentence rationale.",
        prompt: `Task:\n${input.testCase.prompt}\n\nPatch:\n${diff}`
      });
      return {
        passed: true,
        detail: `llm-judge: ${object.score.toFixed(2)} - ${object.rationale}`,
        quality: {
          llmJudgeScore: object.score,
          llmJudgeDetail: object.rationale
        }
      };
    } catch (e) {
      // Anything can be thrown, not only Error instances; reading `.message`
      // unconditionally would report "undefined" for strings or plain objects.
      const reason = e instanceof Error ? e.message : String(e);
      return { passed: true, detail: `llm-judge unavailable: ${reason}` };
    }
  }

  /**
   * Builds the AI SDK model handle for the configured provider and slug.
   * Any provider value other than "openai" takes the anthropic branch.
   *
   * The API key is read from OPENAI_API_KEY / ANTHROPIC_API_KEY; when the
   * variable is unset or empty the literal "mock-key" is passed instead.
   * NOTE(review): presumably a request made with that placeholder is rejected
   * by the provider and ends up in the catch branch of score() - confirm.
   */
  resolveModel() {
    if (this.provider === "openai") {
      const openai = createOpenAI({ apiKey: process.env.OPENAI_API_KEY || "mock-key" });
      return openai(this.modelName);
    }
    const anthropic = createAnthropic({ apiKey: process.env.ANTHROPIC_API_KEY || "mock-key" });
    return anthropic(this.modelName);
  }
};
|
|
60
|
+
|
|
61
|
+
export { LlmJudgeScorer };
|
package/package.json
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@agentgrader/scorer-llm-judge",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Additive, non-blocking LLM-as-judge code-quality scorer for the Agentgrader framework",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"type": "module",
|
|
7
|
+
"main": "./dist/index.js",
|
|
8
|
+
"module": "./dist/index.js",
|
|
9
|
+
"types": "./dist/index.d.ts",
|
|
10
|
+
"exports": {
|
|
11
|
+
".": {
|
|
12
|
+
"import": "./dist/index.js",
|
|
13
|
+
"types": "./dist/index.d.ts"
|
|
14
|
+
}
|
|
15
|
+
},
|
|
16
|
+
"files": [
|
|
17
|
+
"dist"
|
|
18
|
+
],
|
|
19
|
+
"scripts": {
|
|
20
|
+
"build": "tsup src/index.ts --format esm --dts --clean --treeshake",
|
|
21
|
+
"build:watch": "tsup src/index.ts --format esm --dts --watch"
|
|
22
|
+
},
|
|
23
|
+
"dependencies": {
|
|
24
|
+
"@agentgrader/core": "^1.1.0",
|
|
25
|
+
"@ai-sdk/anthropic": "^3.0.82",
|
|
26
|
+
"@ai-sdk/openai": "^1.1.0",
|
|
27
|
+
"ai": "^4.1.0",
|
|
28
|
+
"zod": "^3.23.8"
|
|
29
|
+
},
|
|
30
|
+
"devDependencies": {
|
|
31
|
+
"tsup": "^8.5.1"
|
|
32
|
+
},
|
|
33
|
+
"peerDependencies": {
|
|
34
|
+
"@agentgrader/core": "^1.1.0"
|
|
35
|
+
}
|
|
36
|
+
}
|