evalforge 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
+ export declare function findBinary(): string;
2
+ //# sourceMappingURL=binary.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"binary.d.ts","sourceRoot":"","sources":["../src/binary.ts"],"names":[],"mappings":"AAIA,wBAAgB,UAAU,IAAI,MAAM,CA2BnC"}
package/dist/binary.js ADDED
@@ -0,0 +1,67 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.findBinary = findBinary;
37
+ const child_process_1 = require("child_process");
38
+ const path = __importStar(require("path"));
39
+ const fs = __importStar(require("fs"));
40
+ function findBinary() {
41
+ // 1. EVALFORGE_BIN env var
42
+ if (process.env.EVALFORGE_BIN) {
43
+ return process.env.EVALFORGE_BIN;
44
+ }
45
+ // 2. Walk up from this file looking for target/debug/evalforge
46
+ let current = path.dirname(__filename);
47
+ for (let i = 0; i < 6; i++) {
48
+ const candidate = path.join(current, 'target', 'debug', 'evalforge');
49
+ if (fs.existsSync(candidate))
50
+ return candidate;
51
+ const release = path.join(current, 'target', 'release', 'evalforge');
52
+ if (fs.existsSync(release))
53
+ return release;
54
+ current = path.dirname(current);
55
+ }
56
+ // 3. On PATH
57
+ try {
58
+ const which = (0, child_process_1.execSync)('which evalforge', { encoding: 'utf8' }).trim();
59
+ if (which)
60
+ return which;
61
+ }
62
+ catch { }
63
+ throw new Error('EvalForge binary not found.\n' +
64
+ 'Option 1: git clone https://github.com/heManKuMAR6/evalforge && cargo build --release\n' +
65
+ 'Option 2: export EVALFORGE_BIN=/path/to/evalforge/binary');
66
+ }
67
+ //# sourceMappingURL=binary.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"binary.js","sourceRoot":"","sources":["../src/binary.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAIA,gCA2BC;AA/BD,iDAAyC;AACzC,2CAA6B;AAC7B,uCAAyB;AAEzB,SAAgB,UAAU;IACxB,2BAA2B;IAC3B,IAAI,OAAO,CAAC,GAAG,CAAC,aAAa,EAAE,CAAC;QAC9B,OAAO,OAAO,CAAC,GAAG,CAAC,aAAa,CAAC;IACnC,CAAC;IAED,+DAA+D;IAC/D,IAAI,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;IACvC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3B,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,WAAW,CAAC,CAAC;QACrE,IAAI,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC;YAAE,OAAO,SAAS,CAAC;QAC/C,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,QAAQ,EAAE,SAAS,EAAE,WAAW,CAAC,CAAC;QACrE,IAAI,EAAE,CAAC,UAAU,CAAC,OAAO,CAAC;YAAE,OAAO,OAAO,CAAC;QAC3C,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;IAClC,CAAC;IAED,aAAa;IACb,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,IAAA,wBAAQ,EAAC,iBAAiB,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACvE,IAAI,KAAK;YAAE,OAAO,KAAK,CAAC;IAC1B,CAAC;IAAC,MAAM,CAAC,CAAA,CAAC;IAEV,MAAM,IAAI,KAAK,CACb,+BAA+B;QAC/B,yFAAyF;QACzF,0DAA0D,CAC3D,CAAC;AACJ,CAAC"}
@@ -0,0 +1,5 @@
1
+ import { EvalResult, RunOptions } from './types';
2
+ export { EvalResult, MetricResult, RunOptions, Trace } from './types';
3
+ export declare function run(tracePath: string, options: RunOptions): EvalResult;
4
+ export declare function demo(): EvalResult;
5
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAKA,OAAO,EAAE,UAAU,EAAE,UAAU,EAAS,MAAM,SAAS,CAAC;AAExD,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,UAAU,EAAE,KAAK,EAAE,MAAM,SAAS,CAAC;AAkCtE,wBAAgB,GAAG,CAAC,SAAS,EAAE,MAAM,EAAE,OAAO,EAAE,UAAU,GAAG,UAAU,CAmBtE;AAED,wBAAgB,IAAI,IAAI,UAAU,CAuCjC"}
package/dist/index.js ADDED
@@ -0,0 +1,132 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.run = run;
37
+ exports.demo = demo;
38
+ const child_process_1 = require("child_process");
39
+ const fs = __importStar(require("fs"));
40
+ const os = __importStar(require("os"));
41
+ const path = __importStar(require("path"));
42
+ const binary_1 = require("./binary");
43
+ function parseOutput(output) {
44
+ const metrics = [];
45
+ const lines = output.split('\n');
46
+ let traceId = '';
47
+ let framework = '';
48
+ let overallPassed = true;
49
+ for (let i = 0; i < lines.length; i++) {
50
+ const line = lines[i].trim();
51
+ if (line.startsWith('Trace ID:'))
52
+ traceId = line.split(':')[1].trim();
53
+ if (line.startsWith('Framework:'))
54
+ framework = line.split(':')[1].trim();
55
+ if (line === 'Overall: FAIL')
56
+ overallPassed = false;
57
+ const metricMatch = line.match(/^(\w+)\s+([\d.]+)\s+(PASS|FAIL)$/);
58
+ if (metricMatch) {
59
+ const reason = lines[i + 1]?.trim().startsWith('Reason:')
60
+ ? lines[i + 1].replace('Reason:', '').trim()
61
+ : '';
62
+ metrics.push({
63
+ metric: metricMatch[1],
64
+ score: parseFloat(metricMatch[2]),
65
+ passed: metricMatch[3] === 'PASS',
66
+ reason,
67
+ });
68
+ }
69
+ }
70
+ return { traceId, framework, metrics, passed: overallPassed };
71
+ }
72
+ function run(tracePath, options) {
73
+ const binary = (0, binary_1.findBinary)();
74
+ const args = [
75
+ 'run',
76
+ '--trace', tracePath,
77
+ '--metrics', options.metrics.join(','),
78
+ '--threshold', String(options.threshold ?? 0.7),
79
+ ];
80
+ if (options.mock)
81
+ args.push('--mock');
82
+ if (options.rubric) {
83
+ args.push('--rubric');
84
+ args.push(options.rubric);
85
+ }
86
+ const env = { ...process.env };
87
+ if (options.apiKey)
88
+ env.ANTHROPIC_API_KEY = options.apiKey;
89
+ const result = (0, child_process_1.spawnSync)(binary, args, { encoding: 'utf8', env });
90
+ if (result.error)
91
+ throw result.error;
92
+ return parseOutput(result.stdout);
93
+ }
94
+ function demo() {
95
+ const trace = {
96
+ evalforge_version: '0.1',
97
+ trace_id: 'demo-js-001',
98
+ timestamp: new Date().toISOString(),
99
+ metadata: {
100
+ framework: 'openai-agents',
101
+ model: 'gpt-4o',
102
+ agent_name: 'demo-agent',
103
+ duration_ms: 1200,
104
+ total_tokens: 450,
105
+ },
106
+ input: {
107
+ user: 'What is the capital of Australia?',
108
+ system: 'You are a helpful assistant.',
109
+ },
110
+ steps: [
111
+ { step_id: 1, type: 'thought', content: 'I know this — Canberra.' },
112
+ {
113
+ step_id: 2,
114
+ type: 'tool_call',
115
+ tool: 'web_search',
116
+ input: { query: 'capital of Australia' },
117
+ output: { result: 'Canberra is the capital of Australia.' },
118
+ duration_ms: 400,
119
+ },
120
+ ],
121
+ output: { answer: 'The capital of Australia is Canberra.' },
122
+ eval_hints: {
123
+ expected_tools: ['web_search'],
124
+ expected_answer: 'Canberra',
125
+ context_documents: [],
126
+ },
127
+ };
128
+ const tmp = path.join(os.tmpdir(), `evalforge_demo_${Date.now()}.json`);
129
+ fs.writeFileSync(tmp, JSON.stringify(trace));
130
+ return run(tmp, { metrics: ['faithfulness'], mock: true });
131
+ }
132
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAyCA,kBAmBC;AAED,oBAuCC;AArGD,iDAA0C;AAC1C,uCAAyB;AACzB,uCAAyB;AACzB,2CAA6B;AAC7B,qCAAsC;AAKtC,SAAS,WAAW,CAAC,MAAc;IACjC,MAAM,OAAO,GAAG,EAAE,CAAC;IACnB,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAEjC,IAAI,OAAO,GAAG,EAAE,CAAC;IACjB,IAAI,SAAS,GAAG,EAAE,CAAC;IACnB,IAAI,aAAa,GAAG,IAAI,CAAC;IAEzB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAE7B,IAAI,IAAI,CAAC,UAAU,CAAC,WAAW,CAAC;YAAE,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QACtE,IAAI,IAAI,CAAC,UAAU,CAAC,YAAY,CAAC;YAAE,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QACzE,IAAI,IAAI,KAAK,eAAe;YAAE,aAAa,GAAG,KAAK,CAAC;QAEpD,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,kCAAkC,CAAC,CAAC;QACnE,IAAI,WAAW,EAAE,CAAC;YAChB,MAAM,MAAM,GAAG,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC;gBACvD,CAAC,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE;gBAC5C,CAAC,CAAC,EAAE,CAAC;YACP,OAAO,CAAC,IAAI,CAAC;gBACX,MAAM,EAAE,WAAW,CAAC,CAAC,CAAC;gBACtB,KAAK,EAAE,UAAU,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;gBACjC,MAAM,EAAE,WAAW,CAAC,CAAC,CAAC,KAAK,MAAM;gBACjC,MAAM;aACP,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,OAAO,EAAE,MAAM,EAAE,aAAa,EAAE,CAAC;AAChE,CAAC;AAED,SAAgB,GAAG,CAAC,SAAiB,EAAE,OAAmB;IACxD,MAAM,MAAM,GAAG,IAAA,mBAAU,GAAE,CAAC;IAC5B,MAAM,IAAI,GAAG;QACX,KAAK;QACL,SAAS,EAAE,SAAS;QACpB,WAAW,EAAE,OAAO,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC;QACtC,aAAa,EAAE,MAAM,CAAC,OAAO,CAAC,SAAS,IAAI,GAAG,CAAC;KAChD,CAAC;IAEF,IAAI,OAAO,CAAC,IAAI;QAAE,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACtC,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;QAAC,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QAAC,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;IAAC,CAAC;IAEzE,MAAM,GAAG,GAAG,EAAE,GAAG,OAAO,CAAC,GAAG,EAAE,CAAC;IAC/B,IAAI,OAAO,CAAC,MAAM;QAA
E,GAAG,CAAC,iBAAiB,GAAG,OAAO,CAAC,MAAM,CAAC;IAE3D,MAAM,MAAM,GAAG,IAAA,yBAAS,EAAC,MAAM,EAAE,IAAI,EAAE,EAAE,QAAQ,EAAE,MAAM,EAAE,GAAG,EAAE,CAAC,CAAC;IAClE,IAAI,MAAM,CAAC,KAAK;QAAE,MAAM,MAAM,CAAC,KAAK,CAAC;IAErC,OAAO,WAAW,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;AACpC,CAAC;AAED,SAAgB,IAAI;IAClB,MAAM,KAAK,GAAU;QACnB,iBAAiB,EAAE,KAAK;QACxB,QAAQ,EAAE,aAAa;QACvB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,QAAQ,EAAE;YACR,SAAS,EAAE,eAAe;YAC1B,KAAK,EAAE,QAAQ;YACf,UAAU,EAAE,YAAY;YACxB,WAAW,EAAE,IAAI;YACjB,YAAY,EAAE,GAAG;SAClB;QACD,KAAK,EAAE;YACL,IAAI,EAAE,mCAAmC;YACzC,MAAM,EAAE,8BAA8B;SACvC;QACD,KAAK,EAAE;YACL,EAAE,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,SAAS,EAAE,OAAO,EAAE,yBAAyB,EAAE;YACnE;gBACE,OAAO,EAAE,CAAC;gBACV,IAAI,EAAE,WAAW;gBACjB,IAAI,EAAE,YAAY;gBAClB,KAAK,EAAE,EAAE,KAAK,EAAE,sBAAsB,EAAE;gBACxC,MAAM,EAAE,EAAE,MAAM,EAAE,uCAAuC,EAAE;gBAC3D,WAAW,EAAE,GAAG;aACjB;SACF;QACD,MAAM,EAAE,EAAE,MAAM,EAAE,uCAAuC,EAAE;QAC3D,UAAU,EAAE;YACV,cAAc,EAAE,CAAC,YAAY,CAAC;YAC9B,eAAe,EAAE,UAAU;YAC3B,iBAAiB,EAAE,EAAE;SACtB;KACF,CAAC;IAEF,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,EAAE,EAAE,kBAAkB,IAAI,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;IACxE,EAAE,CAAC,aAAa,CAAC,GAAG,EAAE,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC;IAE7C,OAAO,GAAG,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,CAAC,cAAc,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;AAC7D,CAAC"}
@@ -0,0 +1,54 @@
1
+ export interface MetricResult {
2
+ metric: string;
3
+ score: number;
4
+ passed: boolean;
5
+ reason: string;
6
+ }
7
+ export interface EvalResult {
8
+ traceId: string;
9
+ framework: string;
10
+ metrics: MetricResult[];
11
+ passed: boolean;
12
+ }
13
+ export interface RunOptions {
14
+ metrics: string[];
15
+ threshold?: number;
16
+ mock?: boolean;
17
+ apiKey?: string;
18
+ rubric?: string;
19
+ }
20
+ export interface Trace {
21
+ evalforge_version: string;
22
+ trace_id: string;
23
+ timestamp: string;
24
+ metadata: {
25
+ framework: string;
26
+ model: string;
27
+ agent_name: string;
28
+ duration_ms: number;
29
+ total_tokens: number;
30
+ };
31
+ input: {
32
+ user: string;
33
+ system: string;
34
+ };
35
+ steps: Array<{
36
+ step_id: number;
37
+ type: string;
38
+ content?: string;
39
+ tool?: string;
40
+ input?: Record<string, unknown>;
41
+ output?: Record<string, unknown>;
42
+ duration_ms?: number;
43
+ }>;
44
+ output: {
45
+ answer: string;
46
+ finish_reason?: string;
47
+ };
48
+ eval_hints: {
49
+ expected_tools: string[];
50
+ expected_answer: string | null;
51
+ context_documents: string[];
52
+ };
53
+ }
54
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,YAAY;IAC3B,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,OAAO,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,MAAM,WAAW,UAAU;IACzB,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,YAAY,EAAE,CAAC;IACxB,MAAM,EAAE,OAAO,CAAC;CACjB;AAED,MAAM,WAAW,UAAU;IACzB,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,IAAI,CAAC,EAAE,OAAO,CAAC;IACf,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,KAAK;IACpB,iBAAiB,EAAE,MAAM,CAAC;IAC1B,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,MAAM,CAAC;IAClB,QAAQ,EAAE;QACR,SAAS,EAAE,MAAM,CAAC;QAClB,KAAK,EAAE,MAAM,CAAC;QACd,UAAU,EAAE,MAAM,CAAC;QACnB,WAAW,EAAE,MAAM,CAAC;QACpB,YAAY,EAAE,MAAM,CAAC;KACtB,CAAC;IACF,KAAK,EAAE;QACL,IAAI,EAAE,MAAM,CAAC;QACb,MAAM,EAAE,MAAM,CAAC;KAChB,CAAC;IACF,KAAK,EAAE,KAAK,CAAC;QACX,OAAO,EAAE,MAAM,CAAC;QAChB,IAAI,EAAE,MAAM,CAAC;QACb,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,KAAK,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAChC,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QACjC,WAAW,CAAC,EAAE,MAAM,CAAC;KACtB,CAAC,CAAC;IACH,MAAM,EAAE;QACN,MAAM,EAAE,MAAM,CAAC;QACf,aAAa,CAAC,EAAE,MAAM,CAAC;KACxB,CAAC;IACF,UAAU,EAAE;QACV,cAAc,EAAE,MAAM,EAAE,CAAC;QACzB,eAAe,EAAE,MAAM,GAAG,IAAI,CAAC;QAC/B,iBAAiB,EAAE,MAAM,EAAE,CAAC;KAC7B,CAAC;CACH"}
package/dist/types.js ADDED
@@ -0,0 +1,3 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":""}
package/package.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "name": "evalforge",
3
+ "version": "0.6.0",
4
+ "description": "Framework-agnostic LLM agent evaluation harness",
5
+ "main": "dist/index.js",
6
+ "types": "dist/index.d.ts",
7
+ "scripts": {
8
+ "build": "tsc",
9
+ "test": "node --test tests/*.test.js",
10
+ "prepublishOnly": "npm run build"
11
+ },
12
+ "keywords": [
13
+ "llm",
14
+ "evaluation",
15
+ "agents",
16
+ "langchain",
17
+ "openai"
18
+ ],
19
+ "author": "heManKuMAR6",
20
+ "license": "MIT",
21
+ "repository": {
22
+ "type": "git",
23
+ "url": "https://github.com/heManKuMAR6/evalforge"
24
+ },
25
+ "devDependencies": {
26
+ "@types/node": "^25.5.2",
27
+ "typescript": "^5.0.0"
28
+ }
29
+ }
package/src/.gitkeep ADDED
File without changes
package/src/binary.ts ADDED
@@ -0,0 +1,32 @@
1
+ import { execSync } from 'child_process';
2
+ import * as path from 'path';
3
+ import * as fs from 'fs';
4
+
5
/**
 * Locates the EvalForge CLI executable.
 *
 * Search order:
 *  1. The `EVALFORGE_BIN` environment variable (returned as-is; its existence
 *     is not checked).
 *  2. A Cargo build output — `target/debug` first, then `target/release` — in
 *     this file's directory or one of its five nearest ancestors.
 *  3. The first result of the platform's PATH lookup tool (`which`, or `where`
 *     on Windows).
 *
 * @returns Path to the executable.
 * @throws Error when none of the locations yields a binary.
 */
export function findBinary(): string {
  // 1. Explicit override via environment variable.
  const fromEnv = process.env.EVALFORGE_BIN;
  if (fromEnv) {
    return fromEnv;
  }

  const isWindows = process.platform === 'win32';
  // Cargo appends ".exe" to executables on Windows.
  const exeName = isWindows ? 'evalforge.exe' : 'evalforge';

  // 2. Walk up from this file looking for target/{debug,release}/<exeName>.
  let current = path.dirname(__filename);
  for (let depth = 0; depth < 6; depth++) {
    for (const profile of ['debug', 'release']) {
      const candidate = path.join(current, 'target', profile, exeName);
      if (fs.existsSync(candidate)) return candidate;
    }
    const parent = path.dirname(current);
    if (parent === current) break; // filesystem root reached; nothing above to probe
    current = parent;
  }

  // 3. On PATH. stderr is discarded so a failed lookup does not print noise
  //    into the caller's terminal; a non-zero exit throws and is ignored below.
  try {
    const lookup = isWindows ? 'where evalforge' : 'which evalforge';
    const found = execSync(lookup, {
      encoding: 'utf8',
      stdio: ['ignore', 'pipe', 'ignore'],
    })
      .split(/\r?\n/) // `where` may list several matches, one per line
      .map((entry) => entry.trim())
      .find((entry) => entry.length > 0);
    if (found) return found;
  } catch {
    // Lookup tool missing or binary not on PATH: fall through to the error.
  }

  throw new Error(
    'EvalForge binary not found.\n' +
      'Option 1: git clone https://github.com/heManKuMAR6/evalforge && cargo build --release\n' +
      'Option 2: export EVALFORGE_BIN=/path/to/evalforge/binary'
  );
}
package/src/index.ts ADDED
@@ -0,0 +1,102 @@
1
+ import { spawnSync } from 'child_process';
2
+ import * as fs from 'fs';
3
+ import * as os from 'os';
4
+ import * as path from 'path';
5
+ import { findBinary } from './binary';
6
+ import { EvalResult, RunOptions, Trace } from './types';
7
+
8
+ export { EvalResult, MetricResult, RunOptions, Trace } from './types';
9
+
10
/**
 * Parses the text report printed by the EvalForge binary.
 *
 * Recognised lines (after trimming):
 *  - `Trace ID: <value>` and `Framework: <value>` — everything after the first
 *    colon is kept, so values that contain colons are not truncated.
 *  - `Overall: FAIL` — marks the whole result as failed.
 *  - `<name> <score> PASS|FAIL` — one metric row; if the following line starts
 *    with `Reason:`, its remainder becomes the metric's reason.
 *
 * @param output Raw stdout of the binary.
 * @returns The parsed result; `passed` stays `true` unless an `Overall: FAIL`
 *          line is present.
 */
function parseOutput(output: string): EvalResult {
  const metrics: { metric: string; score: number; passed: boolean; reason: string }[] = [];
  const lines = output.split('\n');

  // Returns the text following the first ':' of a "Label: value" line.
  const valueAfterLabel = (text: string): string =>
    text.slice(text.indexOf(':') + 1).trim();

  const reasonPrefix = 'Reason:';
  const metricRow = /^(\w+)\s+([\d.]+)\s+(PASS|FAIL)$/;

  let traceId = '';
  let framework = '';
  let overallPassed = true;

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i].trim();

    if (line.startsWith('Trace ID:')) traceId = valueAfterLabel(line);
    if (line.startsWith('Framework:')) framework = valueAfterLabel(line);
    if (line === 'Overall: FAIL') overallPassed = false;

    const metricMatch = metricRow.exec(line);
    if (metricMatch) {
      // The reason, when present, is on the line directly below the metric row.
      const following = lines[i + 1]?.trim() ?? '';
      const reason = following.startsWith(reasonPrefix)
        ? following.slice(reasonPrefix.length).trim()
        : '';
      metrics.push({
        metric: metricMatch[1],
        score: parseFloat(metricMatch[2]),
        passed: metricMatch[3] === 'PASS',
        reason,
      });
    }
  }

  return { traceId, framework, metrics, passed: overallPassed };
}
41
+
42
/**
 * Runs the EvalForge binary synchronously against a trace file and parses its
 * report.
 *
 * @param tracePath Path passed to the binary as `--trace`.
 * @param options   Metrics to evaluate plus optional threshold (default 0.7),
 *                  mock mode, rubric and API key. The API key, when given, is
 *                  exposed to the child process as `ANTHROPIC_API_KEY`.
 * @returns The parsed evaluation result.
 * @throws The spawn error if the process could not be started, or an Error
 *         carrying the child's stderr when it ends abnormally without
 *         reporting any metric.
 */
export function run(tracePath: string, options: RunOptions): EvalResult {
  const binary = findBinary();
  const args = [
    'run',
    '--trace', tracePath,
    '--metrics', options.metrics.join(','),
    '--threshold', String(options.threshold ?? 0.7),
  ];

  if (options.mock) args.push('--mock');
  if (options.rubric) args.push('--rubric', options.rubric);

  // Copy the environment so the caller's process.env is never mutated.
  const env = { ...process.env };
  if (options.apiKey) env.ANTHROPIC_API_KEY = options.apiKey;

  const result = spawnSync(binary, args, { encoding: 'utf8', env });
  if (result.error) throw result.error;

  const parsed = parseOutput(result.stdout ?? '');

  // A non-zero exit together with parsed metrics is still returned as a
  // result. Only an abnormal exit that produced no metrics at all is treated
  // as a crash, since otherwise the empty output would be reported as a pass.
  if (result.status !== 0 && parsed.metrics.length === 0) {
    const outcome = result.signal
      ? `was terminated by ${result.signal}`
      : `exited with code ${result.status}`;
    const detail = (result.stderr ?? '').trim();
    throw new Error(
      `evalforge ${outcome} without reporting any metrics` +
        (detail ? `:\n${detail}` : '')
    );
  }

  return parsed;
}
62
+
63
/**
 * Evaluates a small built-in sample trace in mock mode.
 *
 * The trace is written as JSON into a freshly created private temporary
 * directory, evaluated with the `faithfulness` metric via {@link run}, and the
 * directory is removed again whether or not the run succeeds.
 *
 * @returns The evaluation result for the sample trace.
 * @throws Whatever {@link run} throws (e.g. when the binary cannot be found).
 */
export function demo(): EvalResult {
  const trace: Trace = {
    evalforge_version: '0.1',
    trace_id: 'demo-js-001',
    timestamp: new Date().toISOString(),
    metadata: {
      framework: 'openai-agents',
      model: 'gpt-4o',
      agent_name: 'demo-agent',
      duration_ms: 1200,
      total_tokens: 450,
    },
    input: {
      user: 'What is the capital of Australia?',
      system: 'You are a helpful assistant.',
    },
    steps: [
      { step_id: 1, type: 'thought', content: 'I know this — Canberra.' },
      {
        step_id: 2,
        type: 'tool_call',
        tool: 'web_search',
        input: { query: 'capital of Australia' },
        output: { result: 'Canberra is the capital of Australia.' },
        duration_ms: 400,
      },
    ],
    output: { answer: 'The capital of Australia is Canberra.' },
    eval_hints: {
      expected_tools: ['web_search'],
      expected_answer: 'Canberra',
      context_documents: [],
    },
  };

  // mkdtempSync appends random characters, so concurrent calls cannot collide
  // the way a Date.now()-based file name could.
  const workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evalforge_demo_'));
  const tmp = path.join(workDir, 'trace.json');
  try {
    fs.writeFileSync(tmp, JSON.stringify(trace));
    return run(tmp, { metrics: ['faithfulness'], mock: true });
  } finally {
    // Always clean up so repeated demo() calls do not litter the temp dir.
    fs.rmSync(workDir, { recursive: true, force: true });
  }
}
package/src/types.ts ADDED
@@ -0,0 +1,56 @@
1
+ export interface MetricResult { // One metric row as extracted by parseOutput() in index.ts
2
+ metric: string; // metric name: first column of the "<name> <score> PASS|FAIL" row
3
+ score: number; // second column of the row, converted with parseFloat
4
+ passed: boolean; // true when the row's third column is "PASS"
5
+ reason: string; // text after "Reason:" on the line following the row, or '' if absent
6
+ }
7
+
8
+ export interface EvalResult { // Value returned by run() and demo(); built by parseOutput() in index.ts
9
+ traceId: string; // taken from the "Trace ID:" line of the binary's output; '' if no such line
10
+ framework: string; // taken from the "Framework:" line of the binary's output; '' if no such line
11
+ metrics: MetricResult[]; // one entry per metric row found in the output, in output order
12
+ passed: boolean; // false only when the output contains an "Overall: FAIL" line
13
+ }
14
+
15
+ export interface RunOptions { // Options accepted by run() in index.ts
16
+ metrics: string[]; // joined with ',' and passed to the binary as --metrics
17
+ threshold?: number; // passed as --threshold; run() uses 0.7 when omitted
18
+ mock?: boolean; // when truthy, run() adds the --mock flag
19
+ apiKey?: string; // when set, given to the child process as ANTHROPIC_API_KEY
20
+ rubric?: string; // when set, passed to the binary after --rubric
21
+ }
22
+
23
+ export interface Trace { // Shape of the trace object that demo() serializes to JSON and hands to run() via --trace
24
+ evalforge_version: string; // demo() writes '0.1' here
25
+ trace_id: string;
26
+ timestamp: string; // demo() writes new Date().toISOString()
27
+ metadata: {
28
+ framework: string;
29
+ model: string;
30
+ agent_name: string;
31
+ duration_ms: number;
32
+ total_tokens: number;
33
+ };
34
+ input: {
35
+ user: string;
36
+ system: string;
37
+ };
38
+ steps: Array<{
39
+ step_id: number;
40
+ type: string; // demo() uses 'thought' and 'tool_call'; other accepted values are not visible here
41
+ content?: string; // set on the 'thought' step in demo()
42
+ tool?: string; // set on the 'tool_call' step in demo()
43
+ input?: Record<string, unknown>;
44
+ output?: Record<string, unknown>;
45
+ duration_ms?: number;
46
+ }>;
47
+ output: {
48
+ answer: string;
49
+ finish_reason?: string;
50
+ };
51
+ eval_hints: {
52
+ expected_tools: string[];
53
+ expected_answer: string | null;
54
+ context_documents: string[];
55
+ };
56
+ }
@@ -0,0 +1,27 @@
1
+ const { test } = require('node:test');
2
+ const assert = require('node:assert');
3
+ const path = require('path');
4
+
5
// Exercise the build output in dist/, not the TypeScript sources.
const evalforge = require('../dist/index.js');

/**
 * Points EVALFORGE_BIN at target/debug/evalforge three directories above this
 * test file, then runs the package's built-in demo and returns its result.
 */
const runDemoWithDebugBinary = () => {
  process.env.EVALFORGE_BIN = path.resolve(
    __dirname, '../../../target/debug/evalforge'
  );
  return evalforge.demo();
};

test('demo() returns EvalResult with passed=true', () => {
  const result = runDemoWithDebugBinary();
  assert.strictEqual(result.passed, true);
  assert.strictEqual(result.metrics.length > 0, true);
  assert.strictEqual(result.metrics[0].score, 0.91);
});

test('EvalResult has correct shape', () => {
  const result = runDemoWithDebugBinary();
  // Same four membership checks, in the same order, driven by a key list.
  for (const key of ['traceId', 'framework', 'metrics', 'passed']) {
    assert.ok(key in result);
  }
});
package/tsconfig.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2020",
4
+ "module": "commonjs",
5
+ "lib": ["ES2020"],
6
+ "outDir": "./dist",
7
+ "rootDir": "./src",
8
+ "strict": true,
9
+ "esModuleInterop": true,
10
+ "declaration": true,
11
+ "declarationMap": true,
12
+ "sourceMap": true
13
+ },
14
+ "include": ["src/**/*"],
15
+ "exclude": ["node_modules", "dist", "tests"]
16
+ }