coding-agent-benchmarks 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +61 -18
- package/dist/adapters/claudeCodeCLI.d.ts +4 -0
- package/dist/adapters/claudeCodeCLI.d.ts.map +1 -1
- package/dist/adapters/claudeCodeCLI.js +6 -0
- package/dist/adapters/claudeCodeCLI.js.map +1 -1
- package/dist/adapters/copilotCLI.d.ts +4 -0
- package/dist/adapters/copilotCLI.d.ts.map +1 -1
- package/dist/adapters/copilotCLI.js +6 -0
- package/dist/adapters/copilotCLI.js.map +1 -1
- package/dist/evaluator.d.ts +20 -2
- package/dist/evaluator.d.ts.map +1 -1
- package/dist/evaluator.js +50 -51
- package/dist/evaluator.js.map +1 -1
- package/dist/reporter.d.ts +43 -0
- package/dist/reporter.d.ts.map +1 -0
- package/dist/reporter.js +281 -0
- package/dist/reporter.js.map +1 -0
- package/dist/runner.js +80 -65
- package/dist/runner.js.map +1 -1
- package/dist/types.d.ts +12 -0
- package/dist/types.d.ts.map +1 -1
- package/dist/utils/baselineManager.d.ts +2 -2
- package/dist/utils/baselineManager.d.ts.map +1 -1
- package/dist/utils/baselineManager.js +2 -2
- package/dist/utils/baselineManager.js.map +1 -1
- package/package.json +21 -4
|
@@ -20,7 +20,7 @@ export declare class BaselineManager {
|
|
|
20
20
|
/**
|
|
21
21
|
* Save a baseline result
|
|
22
22
|
*/
|
|
23
|
-
saveBaseline(result: EvaluationResult, adapter: AdapterType, model
|
|
23
|
+
saveBaseline(result: EvaluationResult, adapter: AdapterType, model: string): void;
|
|
24
24
|
/**
|
|
25
25
|
* Load a baseline result
|
|
26
26
|
*/
|
|
@@ -28,7 +28,7 @@ export declare class BaselineManager {
|
|
|
28
28
|
/**
|
|
29
29
|
* Compare current result with baseline
|
|
30
30
|
*/
|
|
31
|
-
compareWithBaseline(result: EvaluationResult, adapter: AdapterType, model
|
|
31
|
+
compareWithBaseline(result: EvaluationResult, adapter: AdapterType, model: string): {
|
|
32
32
|
baselineScore: number;
|
|
33
33
|
delta: number;
|
|
34
34
|
isImprovement: boolean;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"baselineManager.d.ts","sourceRoot":"","sources":["../../src/utils/baselineManager.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH,OAAO,EAAE,WAAW,EAAE,gBAAgB,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AAEpE,MAAM,WAAW,YAAY;IAC3B,UAAU,EAAE,MAAM,CAAC;IACnB,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,EAAE,SAAS,EAAE,CAAC;IACxB,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,WAAW,CAAC;IACrB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,qBAAa,eAAe;IAC1B,OAAO,CAAC,WAAW,CAAS;gBAEhB,aAAa,EAAE,MAAM;IAIjC;;OAEG;IACH,OAAO,CAAC,eAAe;IAQvB;;OAEG;IACH,YAAY,CACV,MAAM,EAAE,gBAAgB,EACxB,OAAO,EAAE,WAAW,EACpB,KAAK,
|
|
1
|
+
{"version":3,"file":"baselineManager.d.ts","sourceRoot":"","sources":["../../src/utils/baselineManager.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH,OAAO,EAAE,WAAW,EAAE,gBAAgB,EAAE,SAAS,EAAE,MAAM,UAAU,CAAC;AAEpE,MAAM,WAAW,YAAY;IAC3B,UAAU,EAAE,MAAM,CAAC;IACnB,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,EAAE,SAAS,EAAE,CAAC;IACxB,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,WAAW,CAAC;IACrB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,qBAAa,eAAe;IAC1B,OAAO,CAAC,WAAW,CAAS;gBAEhB,aAAa,EAAE,MAAM;IAIjC;;OAEG;IACH,OAAO,CAAC,eAAe;IAQvB;;OAEG;IACH,YAAY,CACV,MAAM,EAAE,gBAAgB,EACxB,OAAO,EAAE,WAAW,EACpB,KAAK,EAAE,MAAM,GACZ,IAAI;IAsBP;;OAEG;IACH,YAAY,CACV,OAAO,EAAE,WAAW,EACpB,KAAK,EAAE,MAAM,EACb,UAAU,EAAE,MAAM,GACjB,YAAY,GAAG,IAAI;IAgBtB;;OAEG;IACH,mBAAmB,CACjB,MAAM,EAAE,gBAAgB,EACxB,OAAO,EAAE,WAAW,EACpB,KAAK,EAAE,MAAM,GACZ;QACD,aAAa,EAAE,MAAM,CAAC;QACtB,KAAK,EAAE,MAAM,CAAC;QACd,aAAa,EAAE,OAAO,CAAC;KACxB,GAAG,IAAI;IAiBR;;OAEG;IACH,aAAa,CAAC,OAAO,EAAE,WAAW,EAAE,KAAK,CAAC,EAAE,MAAM,GAAG,YAAY,EAAE;IA4BnE;;OAEG;IACH,OAAO,CAAC,uBAAuB;IAe/B;;OAEG;IACH,cAAc,CAAC,OAAO,EAAE,WAAW,EAAE,KAAK,EAAE,MAAM,EAAE,UAAU,EAAE,MAAM,GAAG,OAAO;IAgBhF;;OAEG;IACH,kBAAkB,CAAC,OAAO,EAAE,WAAW,EAAE,KAAK,CAAC,EAAE,MAAM,GAAG,MAAM;CA8CjE"}
|
|
@@ -52,7 +52,7 @@ class BaselineManager {
|
|
|
52
52
|
/**
|
|
53
53
|
* Save a baseline result
|
|
54
54
|
*/
|
|
55
|
-
saveBaseline(result, adapter, model
|
|
55
|
+
saveBaseline(result, adapter, model) {
|
|
56
56
|
const baseline = {
|
|
57
57
|
scenarioId: result.scenario.id,
|
|
58
58
|
score: result.score,
|
|
@@ -90,7 +90,7 @@ class BaselineManager {
|
|
|
90
90
|
/**
|
|
91
91
|
* Compare current result with baseline
|
|
92
92
|
*/
|
|
93
|
-
compareWithBaseline(result, adapter, model
|
|
93
|
+
compareWithBaseline(result, adapter, model) {
|
|
94
94
|
const baseline = this.loadBaseline(adapter, model, result.scenario.id);
|
|
95
95
|
if (!baseline) {
|
|
96
96
|
return null;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"baselineManager.js","sourceRoot":"","sources":["../../src/utils/baselineManager.ts"],"names":[],"mappings":";AAAA;;GAEG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAEH,uCAAyB;AACzB,2CAA6B;AAY7B,MAAa,eAAe;IAG1B,YAAY,aAAqB;QAC/B,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,aAAa,EAAE,aAAa,EAAE,WAAW,CAAC,CAAC;IAC1E,CAAC;IAED;;OAEG;IACK,eAAe,CACrB,OAAoB,EACpB,KAAa,EACb,UAAkB;QAElB,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,OAAO,EAAE,KAAK,EAAE,GAAG,UAAU,OAAO,CAAC,CAAC;IAC3E,CAAC;IAED;;OAEG;IACH,YAAY,CACV,MAAwB,EACxB,OAAoB,EACpB,
|
|
1
|
+
{"version":3,"file":"baselineManager.js","sourceRoot":"","sources":["../../src/utils/baselineManager.ts"],"names":[],"mappings":";AAAA;;GAEG;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAEH,uCAAyB;AACzB,2CAA6B;AAY7B,MAAa,eAAe;IAG1B,YAAY,aAAqB;QAC/B,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,aAAa,EAAE,aAAa,EAAE,WAAW,CAAC,CAAC;IAC1E,CAAC;IAED;;OAEG;IACK,eAAe,CACrB,OAAoB,EACpB,KAAa,EACb,UAAkB;QAElB,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,OAAO,EAAE,KAAK,EAAE,GAAG,UAAU,OAAO,CAAC,CAAC;IAC3E,CAAC;IAED;;OAEG;IACH,YAAY,CACV,MAAwB,EACxB,OAAoB,EACpB,KAAa;QAEb,MAAM,QAAQ,GAAiB;YAC7B,UAAU,EAAE,MAAM,CAAC,QAAQ,CAAC,EAAE;YAC9B,KAAK,EAAE,MAAM,CAAC,KAAK;YACnB,UAAU,EAAE,MAAM,CAAC,UAAU;YAC7B,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACnC,OAAO;YACP,KAAK;SACN,CAAC;QAEF,MAAM,YAAY,GAAG,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE,KAAK,EAAE,MAAM,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;QAC9E,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC;QAEvC,0BAA0B;QAC1B,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;YACxB,EAAE,CAAC,SAAS,CAAC,GAAG,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QACzC,CAAC;QAED,sBAAsB;QACtB,EAAE,CAAC,aAAa,CAAC,YAAY,EAAE,IAAI,CAAC,SAAS,CAAC,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;IAC7E,CAAC;IAED;;OAEG;IACH,YAAY,CACV,OAAoB,EACpB,KAAa,EACb,UAAkB;QAElB,MAAM,YAAY,GAAG,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE,KAAK,EAAE,UAAU,CAAC,CAAC;QAEtE,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE,CAAC;YACjC,OAAO,IAAI,CAAC;QACd,CAAC;QAED,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,EAAE,CAAC,YAAY,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;YACvD,OAAO,IAAI,CAAC,KAAK,CAAC,OAAO,CAAiB,CAAC;QAC7C,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,IAAI,CAAC,+BAA+B,UAAU,GAAG,EAAE,KAAK,CAAC,CAAC;YAClE,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED;;OAEG;IACH,mBAAmB,CACjB,MAAwB,EACxB,OAAoB,EACpB,KAAa;QAMb,MAAM,QAAQ,GAAG,IAAI,CAAC,YAAY,CAAC,OAAO,EAAE,KAAK,EAAE,MAAM,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;QAEvE,IAAI,CAAC,QAAQ,EAAE,CAAC;YACd,OAAO,IAAI,CAAC;QACd,CAAC;QAED,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC;QAC5C,MAAM,aAAa,GAAG,KAAK,GAAG,CAAC,CAAC;QAEhC,OAAO;YACL,aAAa,EAAE,QAAQ,CAAC,KAAK;YAC7B,KAAK;YACL,aAAa;SACd,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,aAAa,CAAC,OAAoB,EAAE,KAAc;QAChD,MAAM,SAAS,GAAmB,EAAE,CAAC;QAErC,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;QACxD,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;YAC/B,OAAO,SAAS,CAAC;QACnB,CAAC;QAED,uDAAuD;QACvD,IAAI,KAAK,EAAE,CAAC;YACV,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,KAAK,CAAC,CAAC;YAC9C,IAAI,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAC5B,IAAI,CAAC,uBAAuB,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;YACpD,CAAC;QACH,CAAC;aAAM,CAAC;YACN,+BAA+B;YAC/B,MAAM,SAAS,GAAG,EAAE,CAAC,WAAW,CAAC,UAAU,EAAE,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,CAAC;YACtE,KAAK,MAAM,GAAG,IAAI,SAAS,EAAE,CAAC;gBAC5B,IAAI,GAAG,CAAC,WAAW,EAAE,EAAE,CAAC;oBACtB,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;oBACjD,IAAI,CAAC,uBAAuB,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;gBACpD,CAAC;YACH,CAAC;QACH,CAAC;QAED,OAAO,SAAS,CAAC;IACnB,CAAC;IAED;;OAEG;IACK,uBAAuB,CAAC,GAAW,EAAE,SAAyB;QACpE,MAAM,KAAK,GAAG,EAAE,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC;QAClC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,IAAI,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;gBAC3B,IAAI,CAAC;oBACH,MAAM,OAAO,GAAG,EAAE,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,IAAI,CAAC,EAAE,OAAO,CAAC,CAAC;oBAC/D,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAiB,CAAC;oBACrD,SAAS,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;gBAC3B,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,OAAO,CAAC,IAAI,CAAC,2BAA2B,IAAI,GAAG,EAAE,KAAK,CAAC,CAAC;gBAC1D,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED;;OAEG;IACH,cAAc,CAAC,OAAoB,EAAE,KAAa,EAAE,UAAkB;QACpE,MAAM,YAAY,GAAG,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE,KAAK,EAAE,UAAU,CAAC,CAAC;QAEtE,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,YAAY,CAAC,EAAE,CAAC;YACjC,OAAO,KAAK,CAAC;QACf,CAAC;QAED,IAAI,CAAC;YACH,EAAE,CAAC,UAAU,CAAC,YAAY,CAAC,CAAC;YAC5B,OAAO,IAAI,CAAC;QACd,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,IAAI,CAAC,iCAAiC,UAAU,GAAG,EAAE,KAAK,CAAC,CAAC;YACpE,OAAO,KAAK,CAAC;QACf,CAAC;IACH,CAAC;IAED;;OAEG;IACH,kBAAkB,CAAC,OAAoB,EAAE,KAAc;QACrD,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;QAExD,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;YAC/B,OAAO,CAAC,CAAC;QACX,CAAC;QAED,IAAI,KAAK,EAAE,CAAC;YACV,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,KAAK,CAAC,CAAC;YAC9C,IAAI,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;gBAC5B,MAAM,KAAK,GAAG,EAAE,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC;gBACvC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;oBACzB,IAAI,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;wBAC3B,IAAI,CAAC;4BACH,EAAE,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC,CAAC;4BACzC,KAAK,EAAE,CAAC;wBACV,CAAC;wBAAC,OAAO,KAAK,EAAE,CAAC;4BACf,OAAO,CAAC,IAAI,CAAC,oBAAoB,IAAI,GAAG,EAAE,KAAK,CAAC,CAAC;wBACnD,CAAC;oBACH,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;aAAM,CAAC;YACN,oBAAoB;YACpB,MAAM,SAAS,GAAG,EAAE,CAAC,WAAW,CAAC,UAAU,EAAE,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,CAAC;YACtE,KAAK,MAAM,GAAG,IAAI,SAAS,EAAE,CAAC;gBAC5B,IAAI,GAAG,CAAC,WAAW,EAAE,EAAE,CAAC;oBACtB,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;oBACjD,MAAM,KAAK,GAAG,EAAE,CAAC,WAAW,CAAC,QAAQ,CAAC,CAAC;oBACvC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;wBACzB,IAAI,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;4BAC3B,IAAI,CAAC;gCACH,EAAE,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,CAAC,CAAC;gCACzC,KAAK,EAAE,CAAC;4BACV,CAAC;4BAAC,OAAO,KAAK,EAAE,CAAC;gCACf,OAAO,CAAC,IAAI,CAAC,oBAAoB,IAAI,GAAG,EAAE,KAAK,CAAC,CAAC;4BACnD,CAAC;wBACH,CAAC;oBACH,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;CACF;AAvND,0CAuNC"}
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "coding-agent-benchmarks",
|
|
3
|
-
"version": "0.
|
|
4
|
-
"description": "
|
|
3
|
+
"version": "0.2.0",
|
|
4
|
+
"description": "Testing coding agents (GitHub Copilot CLI, Claude Code, etc.) with your repo's context to evaluate their code generation quality.",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|
|
7
7
|
"bin": {
|
|
@@ -18,10 +18,24 @@
|
|
|
18
18
|
"benchmark",
|
|
19
19
|
"evaluation",
|
|
20
20
|
"copilot",
|
|
21
|
+
"github-copilot",
|
|
22
|
+
"claude",
|
|
21
23
|
"claude-code",
|
|
22
24
|
"ai",
|
|
25
|
+
"llm",
|
|
26
|
+
"openai",
|
|
27
|
+
"gpt",
|
|
23
28
|
"testing",
|
|
24
|
-
"code-quality"
|
|
29
|
+
"code-quality",
|
|
30
|
+
"code-generation",
|
|
31
|
+
"code-standards",
|
|
32
|
+
"lint",
|
|
33
|
+
"eslint",
|
|
34
|
+
"ci-cd",
|
|
35
|
+
"automation",
|
|
36
|
+
"developer-tools",
|
|
37
|
+
"prompt-testing",
|
|
38
|
+
"ai-testing"
|
|
25
39
|
],
|
|
26
40
|
"author": "chiItepin",
|
|
27
41
|
"license": "MIT",
|
|
@@ -38,12 +52,15 @@
|
|
|
38
52
|
"LICENSE"
|
|
39
53
|
],
|
|
40
54
|
"dependencies": {
|
|
41
|
-
"
|
|
55
|
+
"chalk": "^4.1.2",
|
|
56
|
+
"commander": "^13.1.0",
|
|
57
|
+
"log-update": "^4.0.0"
|
|
42
58
|
},
|
|
43
59
|
"devDependencies": {
|
|
44
60
|
"@types/node": "^25.0.9",
|
|
45
61
|
"eslint": "^9.39.2",
|
|
46
62
|
"prettier": "^3.8.0",
|
|
63
|
+
"tsup": "^8.5.1",
|
|
47
64
|
"tsx": "^4.21.0",
|
|
48
65
|
"typescript": "^5.9.3"
|
|
49
66
|
}
|