coding-agent-benchmarks 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +78 -257
- package/dist/adapters/claudeCodeCLI.d.ts.map +1 -1
- package/dist/adapters/claudeCodeCLI.js +0 -6
- package/dist/adapters/claudeCodeCLI.js.map +1 -1
- package/dist/adapters/copilotCLI.d.ts.map +1 -1
- package/dist/adapters/copilotCLI.js +0 -6
- package/dist/adapters/copilotCLI.js.map +1 -1
- package/dist/config/loader.d.ts.map +1 -1
- package/dist/config/loader.js +0 -7
- package/dist/config/loader.js.map +1 -1
- package/dist/evaluator.d.ts +0 -12
- package/dist/evaluator.d.ts.map +1 -1
- package/dist/evaluator.js +0 -36
- package/dist/evaluator.js.map +1 -1
- package/dist/runner.js +2 -11
- package/dist/runner.js.map +1 -1
- package/dist/types.d.ts +1 -1
- package/dist/utils/baselineManager.d.ts.map +1 -1
- package/dist/utils/baselineManager.js +0 -4
- package/dist/utils/baselineManager.js.map +1 -1
- package/dist/utils/gitUtils.d.ts.map +1 -1
- package/dist/utils/gitUtils.js +0 -6
- package/dist/utils/gitUtils.js.map +1 -1
- package/dist/utils/workspaceUtils.d.ts.map +1 -1
- package/dist/utils/workspaceUtils.js +0 -5
- package/dist/utils/workspaceUtils.js.map +1 -1
- package/dist/validators/llmJudge.d.ts.map +1 -1
- package/dist/validators/llmJudge.js +3 -3
- package/dist/validators/llmJudge.js.map +1 -1
- package/package.json +1 -1
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"workspaceUtils.js","sourceRoot":"","sources":["../../src/utils/workspaceUtils.ts"],"names":[],"mappings":";AAAA;;GAEG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAEH,2CAA6B;AAC7B,uCAAyB;AACzB,yCAAyD;AAEzD;;;;;;;;;GASG;AACI,MAAM,oBAAoB,GAAG,CAAC,YAAqB,EAAU,EAAE;IACpE,
|
|
1
|
+
{"version":3,"file":"workspaceUtils.js","sourceRoot":"","sources":["../../src/utils/workspaceUtils.ts"],"names":[],"mappings":";AAAA;;GAEG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAEH,2CAA6B;AAC7B,uCAAyB;AACzB,yCAAyD;AAEzD;;;;;;;;;GASG;AACI,MAAM,oBAAoB,GAAG,CAAC,YAAqB,EAAU,EAAE;IACpE,IAAI,YAAY,EAAE,CAAC;QACjB,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC;QAC5C,IAAI,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC5B,OAAO,QAAQ,CAAC;QAClB,CAAC;QACD,MAAM,IAAI,KAAK,CAAC,4CAA4C,QAAQ,EAAE,CAAC,CAAC;IAC1E,CAAC;IAED,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,IAAA,qBAAU,GAAE,CAAC;QAC7B,IAAI,OAAO,EAAE,CAAC;YACZ,OAAO,OAAO,CAAC;QACjB,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,iCAAiC;IACnC,CAAC;IAED,OAAO,OAAO,CAAC,GAAG,EAAE,CAAC;AACvB,CAAC,CAAC;AAnBW,QAAA,oBAAoB,wBAmB/B;AAEF;;;;;GAKG;AACI,MAAM,gBAAgB,GAAG,CAC9B,aAAqB,EACrB,aAAgC,EACtB,EAAE;IACZ,OAAO,aAAa,CAAC,GAAG,CAAC,YAAY,CAAC,EAAE;QACtC,IAAI,IAAI,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE,CAAC;YAClC,OAAO,YAAY,CAAC;QACtB,CAAC;QACD,OAAO,IAAI,CAAC,OAAO,CAAC,aAAa,EAAE,YAAY,CAAC,CAAC;IACnD,CAAC,CAAC,CAAC;AACL,CAAC,CAAC;AAVW,QAAA,gBAAgB,oBAU3B;AAEF;;;;;GAKG;AACI,MAAM,gBAAgB,GAAG,CAC9B,aAAqB,EACrB,YAA+B,EACW,EAAE;IAC5C,MAAM,aAAa,GAAG,IAAA,wBAAgB,EAAC,aAAa,EAAE,YAAY,CAAC,CAAC;IACpE,MAAM,OAAO,GAA6C,EAAE,CAAC;IAE7D,KAAK,MAAM,YAAY,IAAI,aAAa,EAAE,CAAC;QACzC,IAAI,CAAC;YACH,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE,CAAC;gBACjC,OAAO,CAAC,IAAI,CAAC,2BAA2B,YAAY,EAAE,CAAC,CAAC;gBACxD,SAAS;YACX,CAAC;YAED,MAAM,OAAO,GAAG,EAAE,CAAC,YAAY,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;YACvD,OAAO,CAAC,IAAI,CAAC;gBACX,IAAI,EAAE,IAAI,CAAC,QAAQ,CAAC,aAAa,EAAE,YAAY,CAAC;gBAChD,OAAO;aACR,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,IAAI,CAAC,+BAA+B,YAAY,KAAK,KAAK,EAAE,CAAC,CAAC;QACxE,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC,CAAC;AAzBW,QAAA,gBAAgB,oBAyB3B"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"llmJudge.d.ts","sourceRoot":"","sources":["../../src/validators/llmJudge.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH,OAAO,EACL,aAAa,EACb,YAAY,EACZ,gBAAgB,EAEjB,MAAM,UAAU,CAAC;AA6DlB,qBAAa,iBAAkB,YAAW,aAAa;IACrD,SAAgB,IAAI,EAAG,WAAW,CAAU;IAC5C,OAAO,CAAC,aAAa,CAAS;IAC9B,OAAO,CAAC,QAAQ,CAAqB;IACrC,OAAO,CAAC,YAAY,CAAS;gBAEjB,aAAa,CAAC,EAAE,MAAM,EAAE,KAAK,GAAE,
|
|
1
|
+
{"version":3,"file":"llmJudge.d.ts","sourceRoot":"","sources":["../../src/validators/llmJudge.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH,OAAO,EACL,aAAa,EACb,YAAY,EACZ,gBAAgB,EAEjB,MAAM,UAAU,CAAC;AA6DlB,qBAAa,iBAAkB,YAAW,aAAa;IACrD,SAAgB,IAAI,EAAG,WAAW,CAAU;IAC5C,OAAO,CAAC,aAAa,CAAS;IAC9B,OAAO,CAAC,QAAQ,CAAqB;IACrC,OAAO,CAAC,YAAY,CAAS;gBAEjB,aAAa,CAAC,EAAE,MAAM,EAAE,KAAK,GAAE,MAAuB;IAMlE;;OAEG;IACG,QAAQ,CACZ,KAAK,EAAE,SAAS,MAAM,EAAE,EACxB,QAAQ,EAAE,YAAY,GACrB,OAAO,CAAC,gBAAgB,CAAC;IA8E5B;;OAEG;IACH,OAAO,CAAC,mBAAmB;IA+B3B;;OAEG;YACW,UAAU;IA4ExB;;OAEG;IACG,SAAS,CAAC,MAAM,EAAE,MAAM,EAAE,KAAK,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;CAYjE"}
|
|
@@ -60,10 +60,10 @@ Respond ONLY with valid JSON in this exact format:
|
|
|
60
60
|
"summary": "one sentence summary"
|
|
61
61
|
}`;
|
|
62
62
|
class LLMJudgeValidator {
|
|
63
|
-
constructor(workspaceRoot, model = "openai/gpt-
|
|
63
|
+
constructor(workspaceRoot, model = "openai/gpt-5") {
|
|
64
64
|
this.type = "llm-judge";
|
|
65
65
|
this.workspaceRoot = (0, workspaceUtils_1.resolveWorkspaceRoot)(workspaceRoot);
|
|
66
|
-
this.apiToken = (0, githubAuth_1.getGitHubToken)();
|
|
66
|
+
this.apiToken = (0, githubAuth_1.getGitHubToken)();
|
|
67
67
|
this.defaultModel = model;
|
|
68
68
|
}
|
|
69
69
|
/**
|
|
@@ -177,7 +177,7 @@ Be strict but fair in your evaluation.`;
|
|
|
177
177
|
{ role: "system", content: judgeSystemPrompt },
|
|
178
178
|
{ role: "user", content: prompt },
|
|
179
179
|
],
|
|
180
|
-
|
|
180
|
+
max_completion_tokens: 1000,
|
|
181
181
|
response_format: { type: "json_object" },
|
|
182
182
|
}),
|
|
183
183
|
});
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"llmJudge.js","sourceRoot":"","sources":["../../src/validators/llmJudge.ts"],"names":[],"mappings":";AAAA;;GAEG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAEH,uCAAyB;AACzB,2CAA6B;AAO7B,4DAGiC;AACjC,oDAAqD;AAqCrD,MAAM,iBAAiB,GAAG;;;;;;;;;;;;;;;;;EAiBxB,CAAC;AAEH,MAAa,iBAAiB;IAM5B,YAAY,aAAsB,EAAE,QAAgB,
|
|
1
|
+
{"version":3,"file":"llmJudge.js","sourceRoot":"","sources":["../../src/validators/llmJudge.ts"],"names":[],"mappings":";AAAA;;GAEG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAEH,uCAAyB;AACzB,2CAA6B;AAO7B,4DAGiC;AACjC,oDAAqD;AAqCrD,MAAM,iBAAiB,GAAG;;;;;;;;;;;;;;;;;EAiBxB,CAAC;AAEH,MAAa,iBAAiB;IAM5B,YAAY,aAAsB,EAAE,QAAgB,cAAc;QALlD,SAAI,GAAG,WAAoB,CAAC;QAM1C,IAAI,CAAC,aAAa,GAAG,IAAA,qCAAoB,EAAC,aAAa,CAAC,CAAC;QACzD,IAAI,CAAC,QAAQ,GAAG,IAAA,2BAAc,GAAE,CAAC;QACjC,IAAI,CAAC,YAAY,GAAG,KAAK,CAAC;IAC5B,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,QAAQ,CACZ,KAAwB,EACxB,QAAsB;QAEtB,MAAM,SAAS,GAAG,QAAQ,CAAC,kBAAkB,CAAC,QAAQ,CAAC;QAEvD,iCAAiC;QACjC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,CAAC;YACxB,OAAO;gBACL,MAAM,EAAE,IAAI;gBACZ,KAAK,EAAE,CAAC,CAAC,EAAE,mBAAmB;gBAC9B,UAAU,EAAE,EAAE;gBACd,aAAa,EAAE,WAAW;aAC3B,CAAC;QACJ,CAAC;QAED,wBAAwB;QACxB,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC;YACnB,OAAO,CAAC,IAAI,CAAC,uDAAuD,CAAC,CAAC;YACtE,OAAO;gBACL,MAAM,EAAE,IAAI;gBACZ,KAAK,EAAE,CAAC,CAAC,EAAE,mBAAmB;gBAC9B,UAAU,EAAE,EAAE;gBACd,aAAa,EAAE,WAAW;gBAC1B,KAAK,EAAE,wBAAwB;aAChC,CAAC;QACJ,CAAC;QAED,IAAI,CAAC;YACH,2BAA2B;YAC3B,MAAM,aAAa,GAAG,IAAA,iCAAgB,EAAC,IAAI,CAAC,aAAa,EAAE,KAAK,CAAC,CAAC;YAClE,MAAM,YAAY,GAAsB,EAAE,CAAC;YAE3C,KAAK,MAAM,QAAQ,IAAI,aAAa,EAAE,CAAC;gBACrC,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;oBAC7B,SAAS;gBACX,CAAC;gBAED,MAAM,OAAO,GAAG,EAAE,CAAC,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;gBACnD,MAAM,YAAY,GAAG,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,aAAa,EAAE,QAAQ,CAAC,CAAC;gBACjE,YAAY,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,YAAY,EAAE,OAAO,EAAE,CAAC,CAAC;YACrD,CAAC;YAED,wBAAwB;YACxB,MAAM,cAAc,GAAG,IAAI,CAAC,mBAAmB,CAC7C,QAAQ,EACR,YAAY,EACZ,SAAS,CAAC,cAAc,CACzB,CAAC;YAEF,kBAAkB;YAClB,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,IAAI,IAAI,CAAC,YAAY,CAAC;YACnD,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,cAAc,EAAE,KAAK,CAAC,CAAC;YAE9D,iCAAiC;YACjC,MAAM,UAAU,GAAgB,CAAC,QAAQ,CAAC,UAAU,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBACtE,IAAI,EAAE,WAAoB;gBAC1B,OAAO,EAAE,CAAC,CAAC,OAAO;gBAClB,IAAI,EAAE,CAAC,CAAC,IAAI;gBACZ,IAAI,EAAE,CAAC,CAAC,IAAI;gBACZ,QAAQ,EAAE,QAAQ,CAAC,QAAQ;gBAC3B,OAAO,EAAE,QAAQ,CAAC,SAAS;aAC5B,CAAC,CAAC,CAAC;YAEJ,OAAO;gBACL,MAAM,EAAE,QAAQ,CAAC,MAAM;gBACvB,KAAK,EAAE,QAAQ,CAAC,KAAK;gBACrB,UAAU;gBACV,aAAa,EAAE,WAAW;aAC3B,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO;gBACL,MAAM,EAAE,KAAK;gBACb,KAAK,EAAE,CAAC;gBACR,UAAU,EAAE,EAAE;gBACd,aAAa,EAAE,WAAW;gBAC1B,KAAK,EAAE,qBAAqB,KAAK,EAAE;aACpC,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;OAEG;IACK,mBAAmB,CACzB,QAAsB,EACtB,YAA+B,EAC/B,YAAqB;QAErB,MAAM,YAAY,GAAG,YAAY;aAC9B,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,CAAC,IAAI,aAAa,CAAC,CAAC,OAAO,UAAU,CAAC;aACzD,IAAI,CAAC,MAAM,CAAC,CAAC;QAEhB,MAAM,kBAAkB,GACtB,YAAY;YACZ;;gCAE0B,QAAQ,CAAC,QAAQ;uCACV,QAAQ,CAAC,QAAQ;;uCAEjB,CAAC;QAEpC,OAAO;UACD,QAAQ,CAAC,WAAW;;;UAGpB,QAAQ,CAAC,MAAM;;;UAGf,YAAY;;;QAGd,kBAAkB,EAAE,CAAC;IAC3B,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,UAAU,CACtB,MAAc,EACd,KAAa;QAEb,MAAM,MAAM,GAAG,qDAAqD,CAAC;QAErE,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,MAAM,EAAE;YACnC,MAAM,EAAE,MAAM;YACd,OAAO,EAAE;gBACP,cAAc,EAAE,kBAAkB;gBAClC,aAAa,EAAE,UAAU,IAAI,CAAC,QAAQ,EAAE;aACzC;YACD,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;gBACnB,KAAK;gBACL,QAAQ,EAAE;oBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,iBAAiB,EAAE;oBAC9C,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE;iBAClC;gBACD,qBAAqB,EAAE,IAAI;gBAC3B,eAAe,EAAE,EAAE,IAAI,EAAE,aAAa,EAAE;aACzC,CAAC;SACH,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;YACjB,MAAM,SAAS,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;YACxC,MAAM,IAAI,KAAK,CACb,4BAA4B,QAAQ,CAAC,MAAM,IAAI,SAAS,EAAE,CAC3D,CAAC;QACJ,CAAC;QAED,MAAM,IAAI,GAAG,CAAC,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAQ,CAAC;QAC5C,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,CAAC;QAElD,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,MAAM,IAAI,KAAK,CAAC,4BAA4B,CAAC,CAAC;QAChD,CAAC;QAED,sBAAsB;QACtB,IAAI,CAAC;YACH,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAmB,CAAC;YAE1D,kCAAkC;YAClC,IACE,CAAC,KAAK,CAAC,OAAO,CAAC,WAAW,CAAC,WAAW,CAAC;gBACvC,WAAW,CAAC,YAAY,IAAI,IAAI;gBAChC,WAAW,CAAC,OAAO,IAAI,IAAI,EAC3B,CAAC;gBACD,MAAM,IAAI,KAAK,CAAC,4BAA4B,CAAC,CAAC;YAChD,CAAC;YAED,qDAAqD;YACrD,2CAA2C;YAC3C,MAAM,UAAU,GAAG,WAAW,CAAC,WAAW;iBACvC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,MAAM,CAAC;iBAClC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBACX,OAAO,EAAE,GAAG,CAAC,CAAC,SAAS,KAAK,CAAC,CAAC,WAAW,EAAE;aAC5C,CAAC,CAAC,CAAC;YAEN,8DAA8D;YAC9D,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,KAAK,CAAC,IAAI,WAAW,CAAC,YAAY,IAAI,GAAG,CAAC;YAE1E,MAAM,QAAQ,GAAgB;gBAC5B,MAAM;gBACN,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,WAAW,CAAC,YAAY,CAAC,CAAC,EAAE,iCAAiC;gBAC5F,SAAS,EAAE,WAAW,CAAC,OAAO;gBAC9B,UAAU;aACX,CAAC;YAEF,OAAO,QAAQ,CAAC;QAClB,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,IAAI,KAAK,CACb,iCAAiC,KAAK,cAAc,OAAO,EAAE,CAC9D,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,SAAS,CAAC,MAAc,EAAE,KAAc;QAC5C,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC;YACnB,OAAO,+BAA+B,CAAC;QACzC,CAAC;QAED,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,MAAM,EAAE,KAAK,IAAI,IAAI,CAAC,YAAY,CAAC,CAAC;YACzE,OAAO,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;QACzC,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,UAAU,KAAK,EAAE,CAAC;QAC3B,CAAC;IACH,CAAC;CACF;AAhOD,8CAgOC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "coding-agent-benchmarks",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "Testing coding agents (GitHub Copilot CLI, Claude Code, etc.) with your repo's context to evaluate their code generation quality.",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|