@darkrishabh/bench-ai 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +333 -0
- package/dist/cli/app.d.ts +11 -0
- package/dist/cli/app.d.ts.map +1 -0
- package/dist/cli/app.js +48 -0
- package/dist/cli/app.js.map +1 -0
- package/dist/cli/components/DiffView.d.ts +5 -0
- package/dist/cli/components/DiffView.d.ts.map +1 -0
- package/dist/cli/components/DiffView.js +14 -0
- package/dist/cli/components/DiffView.js.map +1 -0
- package/dist/cli/components/EvalView.d.ts +6 -0
- package/dist/cli/components/EvalView.d.ts.map +1 -0
- package/dist/cli/components/EvalView.js +82 -0
- package/dist/cli/components/EvalView.js.map +1 -0
- package/dist/cli/components/Spinner.d.ts +4 -0
- package/dist/cli/components/Spinner.d.ts.map +1 -0
- package/dist/cli/components/Spinner.js +15 -0
- package/dist/cli/components/Spinner.js.map +1 -0
- package/dist/cli/index.d.ts +3 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +117 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/run-command.d.ts +11 -0
- package/dist/cli/run-command.d.ts.map +1 -0
- package/dist/cli/run-command.js +119 -0
- package/dist/cli/run-command.js.map +1 -0
- package/dist/engine/cost.d.ts +3 -0
- package/dist/engine/cost.d.ts.map +1 -0
- package/dist/engine/cost.js +52 -0
- package/dist/engine/cost.js.map +1 -0
- package/dist/engine/diff.d.ts +6 -0
- package/dist/engine/diff.d.ts.map +1 -0
- package/dist/engine/diff.js +43 -0
- package/dist/engine/diff.js.map +1 -0
- package/dist/engine/eval.d.ts +14 -0
- package/dist/engine/eval.d.ts.map +1 -0
- package/dist/engine/eval.js +194 -0
- package/dist/engine/eval.js.map +1 -0
- package/dist/engine/index.d.ts +15 -0
- package/dist/engine/index.d.ts.map +1 -0
- package/dist/engine/index.js +10 -0
- package/dist/engine/index.js.map +1 -0
- package/dist/engine/providers/base.d.ts +7 -0
- package/dist/engine/providers/base.d.ts.map +1 -0
- package/dist/engine/providers/base.js +2 -0
- package/dist/engine/providers/base.js.map +1 -0
- package/dist/engine/providers/claude.d.ts +15 -0
- package/dist/engine/providers/claude.d.ts.map +1 -0
- package/dist/engine/providers/claude.js +53 -0
- package/dist/engine/providers/claude.js.map +1 -0
- package/dist/engine/providers/minimax.d.ts +16 -0
- package/dist/engine/providers/minimax.d.ts.map +1 -0
- package/dist/engine/providers/minimax.js +67 -0
- package/dist/engine/providers/minimax.js.map +1 -0
- package/dist/engine/providers/ollama.d.ts +14 -0
- package/dist/engine/providers/ollama.d.ts.map +1 -0
- package/dist/engine/providers/ollama.js +60 -0
- package/dist/engine/providers/ollama.js.map +1 -0
- package/dist/engine/providers/openai-compatible.d.ts +19 -0
- package/dist/engine/providers/openai-compatible.d.ts.map +1 -0
- package/dist/engine/providers/openai-compatible.js +109 -0
- package/dist/engine/providers/openai-compatible.js.map +1 -0
- package/dist/engine/providers/subprocess.d.ts +55 -0
- package/dist/engine/providers/subprocess.d.ts.map +1 -0
- package/dist/engine/providers/subprocess.js +111 -0
- package/dist/engine/providers/subprocess.js.map +1 -0
- package/dist/engine/suite-loader.d.ts +11 -0
- package/dist/engine/suite-loader.d.ts.map +1 -0
- package/dist/engine/suite-loader.js +75 -0
- package/dist/engine/suite-loader.js.map +1 -0
- package/dist/engine/types.d.ts +104 -0
- package/dist/engine/types.d.ts.map +1 -0
- package/dist/engine/types.js +2 -0
- package/dist/engine/types.js.map +1 -0
- package/next-env.d.ts +6 -0
- package/next.config.ts +26 -0
- package/package.json +72 -0
- package/public/icon.svg +14 -0
- package/src/app/api/diff/route.ts +135 -0
- package/src/app/api/models/route.ts +96 -0
- package/src/app/api/suite/route.ts +314 -0
- package/src/app/globals.css +215 -0
- package/src/app/icon.svg +14 -0
- package/src/app/layout.tsx +44 -0
- package/src/app/opengraph-image.tsx +73 -0
- package/src/app/page.tsx +952 -0
- package/src/app/suite/layout.tsx +12 -0
- package/src/app/suite/page.tsx +206 -0
- package/src/app/twitter-image.tsx +1 -0
- package/src/components/BenchAiLogo.tsx +38 -0
- package/src/components/ComparePanel.tsx +643 -0
- package/src/components/ConfigPanel.tsx +809 -0
- package/src/components/MarkdownOutput.tsx +16 -0
- package/src/components/ModelResponseCard.tsx +313 -0
- package/src/components/QuickComparisonBar.tsx +184 -0
- package/src/components/ResponsesLineDiff.tsx +149 -0
- package/src/components/SettingsPanel.tsx +591 -0
- package/src/components/SuitePanel.tsx +875 -0
- package/src/lib/brand.ts +4 -0
- package/src/lib/config-yaml.ts +70 -0
- package/src/lib/consume-suite-sse.ts +70 -0
- package/src/lib/describe-judge.ts +23 -0
- package/src/lib/model-chip-palette.ts +9 -0
- package/src/lib/openai-model-list.ts +33 -0
- package/src/lib/provider-ui.ts +30 -0
- package/src/lib/resolve-credentials.ts +80 -0
- package/src/lib/run-history.ts +66 -0
- package/src/lib/simple-line-diff.ts +50 -0
- package/src/lib/storage.ts +100 -0
- package/src/lib/suite-judge-meta.ts +13 -0
- package/src/lib/suite-run-history.ts +81 -0
- package/src/types.ts +170 -0
- package/vercel.json +5 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { SuiteConfig } from "./types.js";
|
|
2
|
+
/**
|
|
3
|
+
* Interpolate {{varName}} placeholders in a template string.
|
|
4
|
+
*/
|
|
5
|
+
export declare function interpolate(template: string, vars: Record<string, string>): string;
|
|
6
|
+
/**
|
|
7
|
+
* Parse a YAML string into a validated SuiteConfig.
|
|
8
|
+
* Throws a descriptive error if required fields are missing.
|
|
9
|
+
*/
|
|
10
|
+
export declare function parseSuiteConfig(yaml: string): SuiteConfig;
|
|
11
|
+
//# sourceMappingURL=suite-loader.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"suite-loader.d.ts","sourceRoot":"","sources":["../../src/engine/suite-loader.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,WAAW,EAAuB,MAAM,YAAY,CAAC;AAEnE;;GAEG;AACH,wBAAgB,WAAW,CAAC,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,MAAM,CAElF;AAED;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,GAAG,WAAW,CAiC1D"}
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
import { load } from "js-yaml";
|
|
2
|
+
/**
 * Interpolate {{varName}} placeholders in a template string.
 *
 * Only the own properties of `vars` are consulted, so a placeholder such as
 * `{{constructor}}` or `{{toString}}` is never resolved through
 * `Object.prototype`. A placeholder whose variable is absent, `null` or
 * `undefined` is left in the output unchanged.
 *
 * @param template - Text containing zero or more `{{name}}` placeholders,
 *   where `name` matches `\w+`.
 * @param vars - Replacement values keyed by placeholder name.
 * @returns The template with every resolvable placeholder substituted.
 */
export function interpolate(template, vars) {
    return template.replace(/\{\{(\w+)\}\}/g, (placeholder, key) => {
        // Guard against inherited members (constructor, toString, ...) being
        // picked up as if they were user-supplied variables.
        const value = Object.prototype.hasOwnProperty.call(vars, key)
            ? vars[key]
            : undefined;
        return value ?? placeholder;
    });
}
|
|
8
|
+
/**
 * Turn the YAML text of a suite file into a SuiteConfig object.
 *
 * The document must be an object with a non-empty `prompts` list of strings
 * and a non-empty `tests` list of objects. Each test keeps its `vars`
 * (defaulting to an empty object) and, when `assert` is present and truthy,
 * has that list checked by `validateAssertions`.
 *
 * @param yaml - Raw YAML source of the suite.
 * @returns `{ prompts, tests }` built from the parsed document.
 * @throws Error with a message naming the offending field when the
 *   structure is invalid.
 */
export function parseSuiteConfig(yaml) {
    const doc = load(yaml);
    const isObject = Boolean(doc) && typeof doc === "object";
    if (!isObject) {
        throw new Error("Suite config must be a YAML object");
    }
    const rawPrompts = doc.prompts;
    if (!(Array.isArray(rawPrompts) && rawPrompts.length > 0)) {
        throw new Error("Suite config must have at least one entry under `prompts`");
    }
    const rawTests = doc.tests;
    if (!(Array.isArray(rawTests) && rawTests.length > 0)) {
        throw new Error("Suite config must have at least one entry under `tests`");
    }
    // Prompts are validated in full before any test case is looked at.
    const prompts = [];
    for (const [idx, entry] of rawPrompts.entries()) {
        if (typeof entry !== "string") {
            throw new Error(`prompts[${idx}] must be a string`);
        }
        prompts.push(entry);
    }
    const tests = [];
    for (const [idx, entry] of rawTests.entries()) {
        if (entry === null || typeof entry !== "object") {
            throw new Error(`tests[${idx}] must be an object`);
        }
        const vars = entry.vars ?? {};
        let assertions;
        if (entry.assert) {
            assertions = validateAssertions(entry.assert, idx);
        }
        tests.push({ vars, assert: assertions });
    }
    return { prompts, tests };
}
|
|
41
|
+
/**
 * Validate the raw `assert` list of one test case and normalise every entry
 * to just the fields its assertion type uses.
 *
 * Supported types: "contains", "not-contains" and "llm-rubric" (string
 * `value`); "latency" and "cost" (numeric `threshold`).
 *
 * @param raw - The value found under `assert` in the suite YAML.
 * @param testIdx - Index of the owning test case, used in error messages.
 * @returns A new array of normalised assertion objects.
 * @throws Error when `raw` is not an array, an entry is not an object, a
 *   required field has the wrong type, or the assertion type is unknown.
 */
function validateAssertions(raw, testIdx) {
    // A single mapping (or any other non-list) under `assert` would otherwise
    // surface as an opaque "raw.map is not a function" TypeError.
    if (!Array.isArray(raw)) {
        throw new Error(`tests[${testIdx}].assert must be an array`);
    }
    return raw.map((a, i) => {
        if (typeof a !== "object" || a === null) {
            throw new Error(`tests[${testIdx}].assert[${i}] must be an object`);
        }
        const obj = a;
        const type = obj.type;
        switch (type) {
            case "contains":
            case "not-contains":
                if (typeof obj.value !== "string") {
                    throw new Error(`tests[${testIdx}].assert[${i}]: "${type}" requires a string "value"`);
                }
                return { type, value: obj.value };
            case "llm-rubric":
                if (typeof obj.value !== "string") {
                    throw new Error(`tests[${testIdx}].assert[${i}]: "llm-rubric" requires a string "value" (the criterion)`);
                }
                return { type: "llm-rubric", value: obj.value };
            case "latency":
                if (typeof obj.threshold !== "number") {
                    throw new Error(`tests[${testIdx}].assert[${i}]: "latency" requires a numeric "threshold" (ms)`);
                }
                return { type: "latency", threshold: obj.threshold };
            case "cost":
                if (typeof obj.threshold !== "number") {
                    throw new Error(`tests[${testIdx}].assert[${i}]: "cost" requires a numeric "threshold" (USD)`);
                }
                return { type: "cost", threshold: obj.threshold };
            default:
                throw new Error(`tests[${testIdx}].assert[${i}]: unknown assertion type "${type}"`);
        }
    });
}
|
|
75
|
+
//# sourceMappingURL=suite-loader.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"suite-loader.js","sourceRoot":"","sources":["../../src/engine/suite-loader.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,SAAS,CAAC;AAG/B;;GAEG;AACH,MAAM,UAAU,WAAW,CAAC,QAAgB,EAAE,IAA4B;IACxE,OAAO,QAAQ,CAAC,OAAO,CAAC,gBAAgB,EAAE,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,KAAK,GAAG,IAAI,CAAC,CAAC;AACnF,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,IAAY;IAC3C,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,CAA4B,CAAC;IAElD,IAAI,CAAC,GAAG,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;QACpC,MAAM,IAAI,KAAK,CAAC,oCAAoC,CAAC,CAAC;IACxD,CAAC;IAED,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC5D,MAAM,IAAI,KAAK,CAAC,2DAA2D,CAAC,CAAC;IAC/E,CAAC;IAED,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,GAAG,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACxD,MAAM,IAAI,KAAK,CAAC,yDAAyD,CAAC,CAAC;IAC7E,CAAC;IAED,MAAM,OAAO,GAAa,GAAG,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,CAAU,EAAE,CAAS,EAAE,EAAE;QAClE,IAAI,OAAO,CAAC,KAAK,QAAQ;YAAE,MAAM,IAAI,KAAK,CAAC,WAAW,CAAC,oBAAoB,CAAC,CAAC;QAC7E,OAAO,CAAC,CAAC;IACX,CAAC,CAAC,CAAC;IAEH,MAAM,KAAK,GAAgB,GAAG,CAAC,KAAmB,CAAC,GAAG,CAAC,CAAC,CAAU,EAAE,CAAS,EAAE,EAAE;QAC/E,IAAI,OAAO,CAAC,KAAK,QAAQ,IAAI,CAAC,KAAK,IAAI,EAAE,CAAC;YACxC,MAAM,IAAI,KAAK,CAAC,SAAS,CAAC,qBAAqB,CAAC,CAAC;QACnD,CAAC;QACD,MAAM,EAAE,GAAG,CAA4B,CAAC;QACxC,MAAM,IAAI,GAAG,CAAC,EAAE,CAAC,IAAI,IAAI,EAAE,CAA2B,CAAC;QACvD,MAAM,MAAM,GAAG,EAAE,CAAC,MAAM;YACtB,CAAC,CAAC,kBAAkB,CAAC,EAAE,CAAC,MAAmB,EAAE,CAAC,CAAC;YAC/C,CAAC,CAAC,SAAS,CAAC;QACd,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC;IAC1B,CAAC,CAAC,CAAC;IAEH,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,CAAC;AAC5B,CAAC;AAED,SAAS,kBAAkB,CAAC,GAAc,EAAE,OAAe;IACzD,OAAO,GAAG,CAAC,GAAG,CAAC,CAAC,CAAU,EAAE,CAAS,EAAE,EAAE;QACvC,IAAI,OAAO,CAAC,KAAK,QAAQ,IAAI,CAAC,KAAK,IAAI,EAAE,CAAC;YACxC,MAAM,IAAI,KAAK,CAAC,SAAS,OAAO,YAAY,CAAC,qBAAqB,CAAC,CAAC;QACtE,CAAC;QACD,MAAM,GAAG,GAAG,CAA4B,CAAC;QACzC,MAAM,IAAI,GAAG,GAAG,CAAC,IAAc,CAAC;QAEhC,QAAQ,IAAI,EAAE,CAAC;YACb,KAAK,UAAU,CAAC;YAChB,KAAK,cAAc;gBACjB,IAAI,OAAO,GAA
G,CAAC,KAAK,KAAK,QAAQ,EAAE,CAAC;oBAClC,MAAM,IAAI,KAAK,CAAC,SAAS,OAAO,YAAY,CAAC,OAAO,IAAI,6BAA6B,CAAC,CAAC;gBACzF,CAAC;gBACD,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,GAAG,CAAC,KAAK,EAAe,CAAC;YAEjD,KAAK,YAAY;gBACf,IAAI,OAAO,GAAG,CAAC,KAAK,KAAK,QAAQ,EAAE,CAAC;oBAClC,MAAM,IAAI,KAAK,CAAC,SAAS,OAAO,YAAY,CAAC,2DAA2D,CAAC,CAAC;gBAC5G,CAAC;gBACD,OAAO,EAAE,IAAI,EAAE,YAAY,EAAE,KAAK,EAAE,GAAG,CAAC,KAAK,EAAe,CAAC;YAE/D,KAAK,SAAS;gBACZ,IAAI,OAAO,GAAG,CAAC,SAAS,KAAK,QAAQ,EAAE,CAAC;oBACtC,MAAM,IAAI,KAAK,CAAC,SAAS,OAAO,YAAY,CAAC,kDAAkD,CAAC,CAAC;gBACnG,CAAC;gBACD,OAAO,EAAE,IAAI,EAAE,SAAS,EAAE,SAAS,EAAE,GAAG,CAAC,SAAS,EAAe,CAAC;YAEpE,KAAK,MAAM;gBACT,IAAI,OAAO,GAAG,CAAC,SAAS,KAAK,QAAQ,EAAE,CAAC;oBACtC,MAAM,IAAI,KAAK,CAAC,SAAS,OAAO,YAAY,CAAC,gDAAgD,CAAC,CAAC;gBACjG,CAAC;gBACD,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,GAAG,CAAC,SAAS,EAAe,CAAC;YAEjE;gBACE,MAAM,IAAI,KAAK,CAAC,SAAS,OAAO,YAAY,CAAC,8BAA8B,IAAI,GAAG,CAAC,CAAC;QACxF,CAAC;IACH,CAAC,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
/**
 * Name of a provider. Kept as a plain `string` (not a closed union) so that
 * OpenAI-compatible providers (openai, groq, openrouter, etc.) can flow
 * through ProviderResult without a cast.
 */
export type ProviderName = string;

/** Optional settings for the claude, ollama and minimax providers. */
export interface ProviderConfig {
    claude?: {
        apiKey: string;
        model?: string;
    };
    ollama?: {
        baseUrl?: string;
        model?: string;
    };
    minimax?: {
        apiKey: string;
        groupId: string;
        model?: string;
    };
}

/** Outcome of a single completion call against one provider/model. */
export interface ProviderResult {
    provider: ProviderName;
    model: string;
    output: string;
    latencyMs: number;
    inputTokens: number;
    outputTokens: number;
    costUsd: number;
    /** Present when the call did not succeed. */
    error?: string;
}

/** One prompt together with the results collected for it. */
export interface DiffResult {
    prompt: string;
    results: ProviderResult[];
    ranAt: string;
}

/** Input describing which providers to run a prompt against. */
export interface RunOptions {
    prompt: string;
    providers: ProviderName[];
    config: ProviderConfig;
    runs?: number;
}

/**
 * A single check in a test case, discriminated by `type`: text checks carry
 * a string `value`, latency/cost checks carry a numeric `threshold`.
 */
export type Assertion =
    | { type: "contains"; value: string }
    | { type: "not-contains"; value: string }
    | { type: "llm-rubric"; value: string }
    | { type: "latency"; threshold: number }
    | { type: "cost"; threshold: number };

/** One entry under `tests` in a suite. */
export interface TestCase {
    vars?: Record<string, string>;
    assert?: Assertion[];
}

/** Parsed suite definition. */
export interface SuiteConfig {
    /** Prompt templates — use {{varName}} for interpolation */
    prompts: string[];
    tests: TestCase[];
}

/** Verdict for one assertion. */
export interface AssertionResult {
    type: string;
    pass: boolean;
    /** 0–1; deterministic assertions are 0 or 1; llm-rubric is 0 or 1 */
    score: number;
    reason?: string;
    /** Set for llm-rubric: the criterion text from the suite YAML */
    rubricCriterion?: string;
}

/** A provider result extended with its assertion verdicts. */
export interface ProviderTestResult extends ProviderResult {
    assertions: AssertionResult[];
    /** true only if every assertion passed */
    pass: boolean;
    /** fraction of assertions that passed (0–1) */
    score: number;
}

/** Results of one test case across providers. */
export interface TestCaseResult {
    /** Interpolated prompt */
    prompt: string;
    vars: Record<string, string>;
    providerResults: ProviderTestResult[];
    ranAt: string;
}

/** Aggregate figures for one provider/model over a suite run. */
export interface ProviderSummary {
    provider: ProviderName;
    model: string;
    passed: number;
    failed: number;
    total: number;
    /** fraction of test cases passed (0–1) */
    score: number;
    avgLatencyMs: number;
    totalCostUsd: number;
}

/** Full output of a suite run: per-case results plus per-provider summary. */
export interface SuiteResult {
    cases: TestCaseResult[];
    summary: ProviderSummary[];
}
|
|
104
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/engine/types.ts"],"names":[],"mappings":"AAAA;;kDAEkD;AAClD,MAAM,MAAM,YAAY,GAAG,MAAM,CAAC;AAElC,MAAM,WAAW,cAAc;IAC7B,MAAM,CAAC,EAAE;QACP,MAAM,EAAE,MAAM,CAAC;QACf,KAAK,CAAC,EAAE,MAAM,CAAC;KAChB,CAAC;IACF,MAAM,CAAC,EAAE;QACP,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,KAAK,CAAC,EAAE,MAAM,CAAC;KAChB,CAAC;IACF,OAAO,CAAC,EAAE;QACR,MAAM,EAAE,MAAM,CAAC;QACf,OAAO,EAAE,MAAM,CAAC;QAChB,KAAK,CAAC,EAAE,MAAM,CAAC;KAChB,CAAC;CACH;AAED,MAAM,WAAW,cAAc;IAC7B,QAAQ,EAAE,YAAY,CAAC;IACvB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,UAAU;IACzB,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,cAAc,EAAE,CAAC;IAC1B,KAAK,EAAE,MAAM,CAAC;CACf;AAED,MAAM,WAAW,UAAU;IACzB,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,YAAY,EAAE,CAAC;IAC1B,MAAM,EAAE,cAAc,CAAC;IACvB,IAAI,CAAC,EAAE,MAAM,CAAC;CACf;AAID,MAAM,MAAM,SAAS,GACjB;IAAE,IAAI,EAAE,UAAU,CAAC;IAAK,KAAK,EAAE,MAAM,CAAA;CAAE,GACvC;IAAE,IAAI,EAAE,cAAc,CAAC;IAAC,KAAK,EAAE,MAAM,CAAA;CAAE,GACvC;IAAE,IAAI,EAAE,YAAY,CAAC;IAAG,KAAK,EAAE,MAAM,CAAA;CAAE,GACvC;IAAE,IAAI,EAAE,SAAS,CAAC;IAAM,SAAS,EAAE,MAAM,CAAA;CAAE,GAC3C;IAAE,IAAI,EAAE,MAAM,CAAC;IAAS,SAAS,EAAE,MAAM,CAAA;CAAE,CAAC;AAEhD,MAAM,WAAW,QAAQ;IACvB,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC9B,MAAM,CAAC,EAAE,SAAS,EAAE,CAAC;CACtB;AAED,MAAM,WAAW,WAAW;IAC1B,2DAA2D;IAC3D,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,KAAK,EAAE,QAAQ,EAAE,CAAC;CACnB;AAED,MAAM,WAAW,eAAe;IAC9B,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,OAAO,CAAC;IACd,qEAAqE;IACrE,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,iEAAiE;IACjE,eAAe,CAAC,EAAE,MAAM,CAAC;CAC1B;AAED,MAAM,WAAW,kBAAmB,SAAQ,cAAc;IACxD,UAAU,EAAE,eAAe,EAAE,CAAC;IAC9B,0CAA0C;IAC1C,IAAI,EAAE,OAAO,CAAC;IACd,+CAA+C;IAC/C,KAAK,EAAE,MAAM,CAAC;CACf;AAED,MAAM,WAAW,cAAc;IAC7B,0BAA0B;IAC1B,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC7B,eAAe,EAAE,kBAAkB,EAAE,CAAC;IACtC,KAAK,EAA
E,MAAM,CAAC;CACf;AAED,MAAM,WAAW,eAAe;IAC9B,QAAQ,EAAE,YAAY,CAAC;IACvB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;IACd,0CAA0C;IAC1C,KAAK,EAAE,MAAM,CAAC;IACd,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,WAAW;IAC1B,KAAK,EAAE,cAAc,EAAE,CAAC;IACxB,OAAO,EAAE,eAAe,EAAE,CAAC;CAC5B"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/engine/types.ts"],"names":[],"mappings":""}
|
package/next-env.d.ts
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
/// <reference types="next" />
|
|
2
|
+
/// <reference types="next/image-types/global" />
|
|
3
|
+
/// <reference path="./.next/types/routes.d.ts" />
|
|
4
|
+
|
|
5
|
+
// NOTE: This file should not be edited
|
|
6
|
+
// see https://nextjs.org/docs/app/api-reference/config/typescript for more information.
|
package/next.config.ts
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import type { NextConfig } from "next";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { fileURLToPath } from "node:url";
|
|
4
|
+
|
|
5
|
+
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
6
|
+
|
|
7
|
+
/**
 * Next.js configuration for the bundled web UI.
 *
 * - Pins file tracing to this package's directory.
 * - Aliases the package's own name to the compiled engine entry point.
 * - For non-server builds, sets the `child_process` and `util` resolve
 *   fallbacks to `false`.
 */
const nextConfig: NextConfig = {
  outputFileTracingRoot: __dirname,
  webpack: (webpackConfig, options) => {
    const { resolve } = webpackConfig;

    // Same package: resolve engine via emitted JS (src uses .js specifiers).
    const engineEntry = path.resolve(__dirname, "dist/engine/index.js");
    resolve.alias = { ...resolve.alias, "@darkrishabh/bench-ai": engineEntry };

    if (options.isServer) {
      return webpackConfig;
    }

    resolve.fallback = { ...resolve.fallback, child_process: false, util: false };
    return webpackConfig;
  },
};
|
|
25
|
+
|
|
26
|
+
export default nextConfig;
|
package/package.json
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@darkrishabh/bench-ai",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Bench AI — compare LLM outputs across models: CLI, web UI, and YAML eval suites.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"bin": {
|
|
7
|
+
"bench-ai": "./dist/cli/index.js"
|
|
8
|
+
},
|
|
9
|
+
"main": "./dist/engine/index.js",
|
|
10
|
+
"types": "./dist/engine/index.d.ts",
|
|
11
|
+
"exports": {
|
|
12
|
+
".": {
|
|
13
|
+
"import": "./dist/engine/index.js",
|
|
14
|
+
"types": "./dist/engine/index.d.ts"
|
|
15
|
+
}
|
|
16
|
+
},
|
|
17
|
+
"files": [
|
|
18
|
+
"dist",
|
|
19
|
+
"src/app",
|
|
20
|
+
"src/components",
|
|
21
|
+
"src/lib",
|
|
22
|
+
"src/types.ts",
|
|
23
|
+
"public",
|
|
24
|
+
"next.config.ts",
|
|
25
|
+
"next-env.d.ts",
|
|
26
|
+
"vercel.json",
|
|
27
|
+
"README.md"
|
|
28
|
+
],
|
|
29
|
+
"publishConfig": {
|
|
30
|
+
"access": "public"
|
|
31
|
+
},
|
|
32
|
+
"repository": {
|
|
33
|
+
"type": "git",
|
|
34
|
+
"url": "https://github.com/darkrishabh/bench-ai.git"
|
|
35
|
+
},
|
|
36
|
+
"bugs": {
|
|
37
|
+
"url": "https://github.com/darkrishabh/bench-ai/issues"
|
|
38
|
+
},
|
|
39
|
+
"homepage": "https://github.com/darkrishabh/bench-ai#readme",
|
|
40
|
+
"scripts": {
|
|
41
|
+
"predev": "npm run build:engine",
|
|
42
|
+
"build:engine": "tsc -p tsconfig.build.json",
|
|
43
|
+
"dev": "concurrently -k -n engine,web \"tsc -p tsconfig.build.json --watch\" \"next dev\"",
|
|
44
|
+
"build": "npm run build:engine && next build",
|
|
45
|
+
"start": "next start",
|
|
46
|
+
"type-check": "npm run build:engine && tsc -p tsconfig.json --noEmit",
|
|
47
|
+
"clean": "rm -rf dist .next"
|
|
48
|
+
},
|
|
49
|
+
"dependencies": {
|
|
50
|
+
"@anthropic-ai/sdk": "^0.36.3",
|
|
51
|
+
"commander": "^12.1.0",
|
|
52
|
+
"ink": "^5.0.1",
|
|
53
|
+
"js-yaml": "^4.1.1",
|
|
54
|
+
"next": "^15.5.15",
|
|
55
|
+
"react": "^18.3.1",
|
|
56
|
+
"react-dom": "^18.3.1",
|
|
57
|
+
"react-markdown": "^9.0.1",
|
|
58
|
+
"remark-gfm": "^4.0.0"
|
|
59
|
+
},
|
|
60
|
+
"devDependencies": {
|
|
61
|
+
"@types/js-yaml": "^4.0.9",
|
|
62
|
+
"@types/node": "^22.10.5",
|
|
63
|
+
"@types/react": "^18.3.12",
|
|
64
|
+
"@types/react-dom": "^18.3.1",
|
|
65
|
+
"concurrently": "^9.1.2",
|
|
66
|
+
"typescript": "^5.7.3"
|
|
67
|
+
},
|
|
68
|
+
"engines": {
|
|
69
|
+
"node": ">=18"
|
|
70
|
+
},
|
|
71
|
+
"packageManager": "npm@10.9.2"
|
|
72
|
+
}
|
package/public/icon.svg
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 32 32" role="img" aria-label="Bench AI">
|
|
2
|
+
<defs>
|
|
3
|
+
<linearGradient id="bg" x1="4" y1="2" x2="28" y2="30" gradientUnits="userSpaceOnUse">
|
|
4
|
+
<stop stop-color="#1e40af"/>
|
|
5
|
+
<stop offset="1" stop-color="#172554"/>
|
|
6
|
+
</linearGradient>
|
|
7
|
+
</defs>
|
|
8
|
+
<rect width="32" height="32" rx="8" fill="url(#bg)"/>
|
|
9
|
+
<!-- Side-by-side panels = compare -->
|
|
10
|
+
<rect x="7" y="9" width="7" height="14" rx="2" fill="#ffffff" fill-opacity="0.95"/>
|
|
11
|
+
<rect x="18" y="9" width="7" height="14" rx="2" fill="#ffffff" fill-opacity="0.78"/>
|
|
12
|
+
<!-- Subtle divider -->
|
|
13
|
+
<rect x="15" y="8" width="2" height="16" rx="1" fill="#ffffff" fill-opacity="0.35"/>
|
|
14
|
+
</svg>
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
import { NextRequest, NextResponse } from "next/server";
|
|
2
|
+
import {
|
|
3
|
+
ClaudeProvider,
|
|
4
|
+
OllamaProvider,
|
|
5
|
+
MinimaxProvider,
|
|
6
|
+
OpenAICompatibleProvider,
|
|
7
|
+
createClaudeCLIProvider,
|
|
8
|
+
createCodexProvider,
|
|
9
|
+
} from "@darkrishabh/bench-ai";
|
|
10
|
+
import type { ProviderResult } from "@darkrishabh/bench-ai";
|
|
11
|
+
import type { LLMInstance, WebProviderResult } from "@/types";
|
|
12
|
+
import { PRESET_BASE_URLS } from "@/types";
|
|
13
|
+
|
|
14
|
+
/**
 * POST /api/diff — send one prompt to every enabled model instance in
 * parallel and return all results.
 *
 * Request body: `{ prompt: string, instances: LLMInstance[] }`.
 * Responds 400 when the body is not valid JSON, the prompt is missing or
 * blank, or no instance is enabled. A failure of an individual provider
 * does not fail the request: it is reported in that result's `error` field
 * with zeroed metrics.
 *
 * @param req - Incoming request carrying the JSON body.
 * @returns JSON `{ prompt, results, ranAt }` where `ranAt` is an ISO timestamp.
 */
export async function POST(req: NextRequest) {
  type DiffRequestBody = { prompt?: string; instances?: LLMInstance[] };

  let body: DiffRequestBody | null;
  try {
    body = (await req.json()) as DiffRequestBody | null;
  } catch {
    // Malformed JSON is a client error, not an unhandled server failure.
    return NextResponse.json({ error: "request body must be valid JSON" }, { status: 400 });
  }

  // Narrow once so the provider calls below need no non-null assertions.
  const prompt = typeof body?.prompt === "string" ? body.prompt : "";
  if (!prompt.trim()) {
    return NextResponse.json({ error: "prompt is required" }, { status: 400 });
  }

  const rawInstances = body?.instances;
  const instances = Array.isArray(rawInstances) ? rawInstances : [];
  const enabled = instances.filter((i) => i?.enabled);
  if (enabled.length === 0) {
    return NextResponse.json({ error: "no enabled instances" }, { status: 400 });
  }

  const results: WebProviderResult[] = await Promise.all(
    enabled.map(async (instance): Promise<WebProviderResult> => {
      const label = `${instance.provider} / ${instance.model}`;
      let base: ProviderResult;

      try {
        switch (instance.provider) {
          // ── Native providers ────────────────────────────────────────────
          case "claude": {
            const apiKey = instance.apiKey?.trim() || process.env.ANTHROPIC_API_KEY;
            if (!apiKey) throw new Error("No API key — add one in Configure or set ANTHROPIC_API_KEY");
            const p = new ClaudeProvider(apiKey, instance.model, {
              maxTokens: instance.maxTokens,
              temperature: instance.temperature,
            });
            base = await p.complete(prompt);
            break;
          }

          case "ollama": {
            const p = new OllamaProvider(
              instance.baseUrl || process.env.OLLAMA_BASE_URL || "http://localhost:11434",
              instance.model,
              { temperature: instance.temperature }
            );
            base = await p.complete(prompt);
            break;
          }

          case "claude-cli": {
            const p = createClaudeCLIProvider(instance.model, { timeoutMs: 120_000 });
            base = await p.complete(prompt);
            break;
          }

          case "codex": {
            const p = createCodexProvider(instance.model, { timeoutMs: 120_000 });
            base = await p.complete(prompt);
            break;
          }

          case "minimax": {
            const apiKey = instance.apiKey?.trim() || process.env.MINIMAX_API_KEY;
            const groupId = instance.groupId?.trim() || process.env.MINIMAX_GROUP_ID;
            if (!apiKey || !groupId) throw new Error("Missing Minimax credentials — add API Key and Group ID in Configure");
            const p = new MinimaxProvider(apiKey, groupId, instance.model, {
              maxTokens: instance.maxTokens,
              temperature: instance.temperature,
            });
            base = await p.complete(prompt);
            break;
          }

          // ── OpenAI-compatible providers ─────────────────────────────────
          default: {
            // Env fallback is derived from the provider name, e.g.
            // "my-provider" → MY_PROVIDER_API_KEY.
            // NOTE(review): both the provider name and baseUrl come from the
            // request, so a server-side *_API_KEY can end up being sent to a
            // caller-chosen URL. Confirm this route is only exposed to
            // trusted users, or restrict the env fallback to preset URLs.
            const apiKey = instance.apiKey?.trim() || process.env[`${instance.provider.toUpperCase().replace(/-/g, "_")}_API_KEY`] || "";
            const baseUrl =
              instance.baseUrl?.trim() ||
              PRESET_BASE_URLS[instance.provider as keyof typeof PRESET_BASE_URLS] ||
              "";

            if (!baseUrl) throw new Error(`No base URL configured for "${instance.provider}"`);

            // OpenRouter recommends an HTTP-Referer header
            const extraHeaders: Record<string, string> =
              instance.provider === "openrouter"
                ? { "HTTP-Referer": "https://github.com/darkrishabh/bench-ai", "X-Title": "Bench AI" }
                : {};

            const p = new OpenAICompatibleProvider(
              instance.provider,
              baseUrl,
              apiKey,
              instance.model,
              {
                maxTokens: instance.maxTokens,
                temperature: instance.temperature,
                extraHeaders,
              }
            );
            base = await p.complete(prompt);
            break;
          }
        }
      } catch (err) {
        // Report the failure in-band so the other instances' results survive.
        base = {
          provider: instance.provider,
          model: instance.model,
          output: "",
          latencyMs: 0,
          inputTokens: 0,
          outputTokens: 0,
          costUsd: 0,
          error: err instanceof Error ? err.message : String(err),
        };
      }

      return { ...base, instanceId: instance.id, label };
    })
  );

  return NextResponse.json({
    prompt,
    results,
    ranAt: new Date().toISOString(),
  });
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import { NextRequest, NextResponse } from "next/server";
|
|
2
|
+
import { PRESET_MODELS } from "@/types";
|
|
3
|
+
import { filterOpenAiChatModelIds } from "@/lib/openai-model-list";
|
|
4
|
+
|
|
5
|
+
/**
 * GET /api/models?provider=ollama&baseUrl=http://localhost:11434
 *
 * Server-side proxy for model-listing APIs, so the browser is spared CORS
 * and mixed-content problems when the target is a local server.
 *
 * Only `provider=ollama` is handled: the handler fetches `<baseUrl>/api/tags`
 * (default base `http://localhost:11434`, 5 s timeout) and returns the model
 * names sorted. Any other provider yields 400; a failed upstream call yields
 * 502 with the error message.
 */
export async function GET(req: NextRequest) {
  const query = req.nextUrl.searchParams;
  const provider = query.get("provider");
  // Drop a single trailing slash so the path can be appended cleanly.
  const baseUrl = (query.get("baseUrl") ?? "").replace(/\/$/, "");

  try {
    if (provider !== "ollama") {
      return NextResponse.json({ error: `No dynamic model list for "${provider}"` }, { status: 400 });
    }

    const origin = baseUrl || "http://localhost:11434";
    const res = await fetch(`${origin}/api/tags`, { signal: AbortSignal.timeout(5_000) });
    if (!res.ok) {
      throw new Error(`Ollama ${res.status}`);
    }

    const payload = (await res.json()) as { models?: Array<{ name: string }> };
    const models = (payload.models ?? []).map((entry) => entry.name);
    models.sort();
    return NextResponse.json({ models });
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    return NextResponse.json({ error: message }, { status: 502 });
  }
}
|
|
37
|
+
|
|
38
|
+
const OPENAI_FALLBACK = PRESET_MODELS.openai;
|
|
39
|
+
|
|
40
|
+
/**
 * POST /api/models — list OpenAI chat models (requires API key from body or OPENAI_API_KEY on server).
 *
 * Request body: `{ provider: "openai", apiKey?: string, baseUrl?: string }`.
 * Responds 400 when the body is not valid JSON or the provider is not
 * "openai". In every other case the response is 200: either the filtered
 * list from `<baseUrl>/models` (`source: "api"`) or the preset list
 * (`source: "preset"`), the latter with an `error` string when the upstream
 * call failed or returned no chat-capable models.
 *
 * @param req - Incoming request carrying the JSON body.
 */
export async function POST(req: NextRequest) {
  type ModelsRequestBody = { provider?: string; apiKey?: string; baseUrl?: string };

  let body: ModelsRequestBody | null;
  try {
    body = (await req.json()) as ModelsRequestBody | null;
  } catch {
    // Malformed JSON is a client error, not an unhandled server failure.
    return NextResponse.json({ error: "request body must be valid JSON" }, { status: 400 });
  }

  if (body?.provider !== "openai") {
    return NextResponse.json({ error: "Only provider \"openai\" is supported for POST" }, { status: 400 });
  }

  // Body fields are untrusted: ignore anything that is not a string.
  const suppliedKey = typeof body.apiKey === "string" ? body.apiKey.trim() : "";
  const suppliedBase = typeof body.baseUrl === "string" ? body.baseUrl.trim() : "";

  // NOTE(review): when the caller supplies baseUrl but no apiKey, the
  // server's OPENAI_API_KEY is sent to that caller-chosen URL. Confirm this
  // route is only exposed to trusted users.
  const key = suppliedKey || process.env.OPENAI_API_KEY?.trim();
  const base = (suppliedBase || "https://api.openai.com/v1").replace(/\/$/, "");

  if (!key) {
    return NextResponse.json({ models: OPENAI_FALLBACK, source: "preset" as const });
  }

  try {
    const res = await fetch(`${base}/models`, {
      headers: { Authorization: `Bearer ${key}` },
      signal: AbortSignal.timeout(12_000),
    });

    if (!res.ok) {
      // Include at most 220 characters of the upstream body in the error.
      const text = await res.text().catch(() => "");
      return NextResponse.json({
        models: OPENAI_FALLBACK,
        source: "preset" as const,
        error: `OpenAI ${res.status}${text ? `: ${text.slice(0, 220)}` : ""}`,
      });
    }

    const data = (await res.json()) as { data?: Array<{ id: string }> };
    const raw = (data.data ?? []).map((m) => m.id);
    const models = filterOpenAiChatModelIds(raw);

    if (models.length === 0) {
      return NextResponse.json({
        models: OPENAI_FALLBACK,
        source: "preset" as const,
        error: "No chat-capable models returned from API",
      });
    }

    return NextResponse.json({ models, source: "api" as const });
  } catch (err) {
    return NextResponse.json({
      models: OPENAI_FALLBACK,
      source: "preset" as const,
      error: err instanceof Error ? err.message : String(err),
    });
  }
}
|