evaldog 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +46 -0
- package/index.js +219 -0
- package/package.json +17 -0
package/README.md
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# evaldog
|
|
2
|
+
|
|
3
|
+
Run LLM / prompt evals locally — **deterministic grading, zero LLM tokens.**
|
|
4
|
+
|
|
5
|
+
Built for CI gates and AI agents that need a fast pass/fail on prompt/RAG outputs
|
|
6
|
+
without burning context window.
|
|
7
|
+
|
|
8
|
+
```bash
|
|
9
|
+
npx evaldog run cases.csv
|
|
10
|
+
npx evaldog run cases.csv --min 80 # exit 1 if score < 80 (CI gate)
|
|
11
|
+
npx evaldog run cases.csv --json # machine-readable (for agents)
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## File formats
|
|
15
|
+
|
|
16
|
+
**CSV** — `name,output,expected,assert`
|
|
17
|
+
```csv
|
|
18
|
+
name,output,expected,assert
|
|
19
|
+
Password reset,Click the reset link.,reset link,contains
|
|
20
|
+
JSON shape,"{""ok"":true}",,is-json
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
**JSON / YAML**
|
|
24
|
+
```yaml
|
|
25
|
+
cases:
|
|
26
|
+
- name: greeting
|
|
27
|
+
output: "Sure! Happy to help."
|
|
28
|
+
assert:
|
|
29
|
+
- { type: not-empty }
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Assertions: `contains · icontains · equals · regex · is-json · not-empty`
|
|
33
|
+
(default `contains` when an `expected` value is present, else `not-empty`).
|
|
34
|
+
|
|
35
|
+
## Why agents like it
|
|
36
|
+
|
|
37
|
+
An agent can grade 200 outputs with one shell call and read back a single number
|
|
38
|
+
and an exit code — instead of streaming every case through the model.
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
evaldog run outputs.csv --json --min 90 || echo "regression!"
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Hosted dashboard + scheduled drift alerts: **https://evaldog.com**
|
|
45
|
+
|
|
46
|
+
MIT © The Testing Academy
|
package/index.js
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* EvalDog CLI — run LLM/prompt evals locally with deterministic grading.
|
|
6
|
+
* Zero LLM tokens: it grades outputs you already have against simple assertions.
|
|
7
|
+
* Perfect for CI gates and AI agents that need a pass/fail without burning context.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
const fs = require('fs');
|
|
11
|
+
const path = require('path');
|
|
12
|
+
|
|
13
|
+
let yaml = null;
|
|
14
|
+
let Papa = null;
|
|
15
|
+
try {
|
|
16
|
+
yaml = require('js-yaml');
|
|
17
|
+
} catch (_) {
|
|
18
|
+
/* optional until a yaml file is used */
|
|
19
|
+
}
|
|
20
|
+
try {
|
|
21
|
+
Papa = require('papaparse');
|
|
22
|
+
} catch (_) {
|
|
23
|
+
/* optional until a csv file is used */
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
// ---------------------------------------------------------------- grading ----
|
|
27
|
+
/**
 * Maps every accepted spelling of an assertion type to its canonical name.
 *
 * The table is built on a null-prototype object so that bracket lookups with
 * arbitrary user input (e.g. "constructor" or "__proto__") can never resolve
 * to an inherited Object.prototype member, and it is frozen because it is a
 * shared module-level constant.
 */
const ALIASES = Object.freeze(
  Object.assign(Object.create(null), {
    contains: 'contains',
    icontains: 'icontains',
    'i-contains': 'icontains',
    equals: 'equals',
    equal: 'equals',
    exact: 'equals',
    regex: 'regex',
    matches: 'regex',
    'is-json': 'is-json',
    json: 'is-json',
    'not-empty': 'not-empty',
    notempty: 'not-empty',
  }),
);
|
|
41
|
+
/**
 * Normalises a user-supplied assertion type to its canonical name.
 *
 * The input is stringified, lower-cased and trimmed before being looked up in
 * ALIASES. Unrecognised names fall back to 'contains'.
 *
 * Only own keys of ALIASES are honoured: a bare `ALIASES[key]` lookup would
 * return inherited members for input such as "constructor", handing a
 * non-string "type" to the grader.
 *
 * @param {*} t - Raw assertion type (any value; coerced with String()).
 * @returns {string} Canonical assertion type.
 */
const normType = (t) => {
  const key = String(t).toLowerCase().trim();
  return Object.hasOwn(ALIASES, key) ? ALIASES[key] : 'contains';
};
|
|
42
|
+
/**
 * Shortens a value to at most `n` UTF-16 code units for display, appending an
 * ellipsis when anything was cut off.
 *
 * If the cut would fall between the two halves of a surrogate pair (e.g. in
 * the middle of an emoji), it is moved back by one unit so the result never
 * ends in a lone surrogate.
 *
 * @param {*} s - Value to shorten (coerced with String()).
 * @param {number} n - Maximum number of code units to keep.
 * @returns {string} The original text, or its shortened form followed by '…'.
 */
const truncate = (s, n) => {
  const text = String(s);
  if (text.length <= n) return text;

  let cut = n;
  const before = text.charCodeAt(cut - 1);
  const after = text.charCodeAt(cut);
  const splitsPair = before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
  if (splitsPair) cut -= 1;

  return text.slice(0, cut) + '…';
};
|
|
46
|
+
|
|
47
|
+
/**
 * Evaluates a single assertion against one output.
 *
 * The comparison value is `a.value` when set, otherwise `expected`, otherwise
 * the empty string; both it and the output are compared as strings.
 *
 * `contains`, `icontains` and `regex` need a non-empty comparison value: an
 * empty needle or pattern matches every output, so such a check would always
 * pass. Those cases are reported as failures with an explicit label instead.
 *
 * @param {*} output - The output under test (null/undefined count as '').
 * @param {{type: string, value?: *}} a - Assertion with a canonical type.
 * @param {*} [expected] - Case-level expected value, used when `a.value` is unset.
 * @returns {{ok: boolean, label: string}} Outcome and a short description.
 */
function runAssert(output, a, expected) {
  const out = output == null ? '' : String(output);
  const val = String(a.value != null ? a.value : expected != null ? expected : '');

  const needsValue = a.type === 'contains' || a.type === 'icontains' || a.type === 'regex';
  if (needsValue && val === '') {
    return { ok: false, label: `${a.type} (no expected value)` };
  }

  switch (a.type) {
    case 'equals':
      // Surrounding whitespace is ignored on both sides.
      return { ok: out.trim() === val.trim(), label: `equals "${truncate(val, 24)}"` };
    case 'contains':
      return { ok: out.includes(val), label: `contains "${truncate(val, 24)}"` };
    case 'icontains':
      return { ok: out.toLowerCase().includes(val.toLowerCase()), label: `icontains "${truncate(val, 24)}"` };
    case 'regex':
      try {
        return { ok: new RegExp(val).test(out), label: `matches /${truncate(val, 20)}/` };
      } catch (_) {
        // A pattern that does not compile is a failed check, not a crash.
        return { ok: false, label: 'invalid regex' };
      }
    case 'is-json':
      try {
        JSON.parse(out);
        return { ok: true, label: 'valid JSON' };
      } catch (_) {
        return { ok: false, label: 'valid JSON' };
      }
    case 'not-empty':
      return { ok: out.trim().length > 0, label: 'not empty' };
    default:
      return { ok: false, label: 'unknown assert' };
  }
}
|
|
76
|
+
|
|
77
|
+
/**
 * Grades every case and summarises the run.
 *
 * A case passes when all of its assertions pass; a case without assertions is
 * checked with a single `not-empty` assertion. The score is the percentage of
 * passing cases rounded to the nearest integer (0 for an empty run).
 *
 * @param {Array<{name?: string, output?: *, expected?: *, asserts?: Array<{type: string, value?: *}>}>} cases
 * @returns {{score: number, total: number, passed: number, failed: number, cases: Array<{name: string, output: *, passed: boolean, checks: Array<{ok: boolean, label: string}>}>}}
 */
function grade(cases) {
  const results = cases.map((c, i) => {
    const asserts = c.asserts && c.asserts.length ? c.asserts : [{ type: 'not-empty' }];
    const checks = asserts.map((a) => runAssert(c.output, a, c.expected));
    return {
      name: c.name || `Case ${i + 1}`,
      // Only null/undefined become '': a falsy output such as 0 or false was
      // graded as "0"/"false", so it is reported as-is rather than as ''.
      output: c.output != null ? c.output : '',
      passed: checks.every((x) => x.ok),
      checks,
    };
  });

  const total = results.length;
  const passed = results.filter((r) => r.passed).length;
  const score = total ? Math.round((passed / total) * 100) : 0;
  return { score, total, passed, failed: total - passed, cases: results };
}
|
|
87
|
+
|
|
88
|
+
// ---------------------------------------------------------------- parsing ----
|
|
89
|
+
/**
 * Turns CSV text into eval cases, one per data row.
 *
 * Column names are matched case-insensitively with surrounding whitespace
 * ignored. Recognised columns: output / actual / response for the output,
 * expected / expected_output / gold for the expected value, name / id / test
 * for the case name, and assert for the assertion type.
 *
 * @param {string} text - Raw CSV content with a header row.
 * @returns {Array<{name: string, output: *, expected: *, asserts: Array<{type: string, value: *}>}>}
 * @throws {Error} When the optional "papaparse" dependency is not installed.
 */
function parseCsv(text) {
  if (!Papa) throw new Error('Install "papaparse" to read CSV files');

  const parsed = Papa.parse(text, { header: true, skipEmptyLines: true });

  return parsed.data.map((row, index) => {
    // Re-key the row so header lookups ignore case and stray whitespace.
    const fields = {};
    for (const [header, cell] of Object.entries(row)) {
      fields[header.toLowerCase().trim()] = cell;
    }

    let output;
    if (fields.output != null) output = fields.output;
    else if (fields.actual != null) output = fields.actual;
    else output = fields.response || '';

    let expected;
    if (fields.expected != null) expected = fields.expected;
    else if (fields.expected_output != null) expected = fields.expected_output;
    else expected = fields.gold;

    // An explicit assert column wins; otherwise the presence of an expected
    // value decides between a substring check and a plain non-empty check.
    let type;
    if (fields.assert) type = normType(fields.assert);
    else if (expected) type = 'contains';
    else type = 'not-empty';

    const name = fields.name || fields.id || fields.test || `Case ${index + 1}`;
    return { name, output, expected, asserts: [{ type, value: expected }] };
  });
}
|
|
101
|
+
|
|
102
|
+
/**
 * Turns parsed JSON/YAML data into eval cases.
 *
 * Accepts either a top-level array of cases or an object holding them under
 * `cases` or `tests`; anything else yields an empty list. Per case, the output
 * is read from output / actual / response / vars.output, the expected value
 * from expected / gold, and the name from name / description / id.
 *
 * `assert` (or `asserts`) may be an array, a single assertion object, or a
 * bare type name; array entries may likewise be objects or type names. When no
 * usable assertion is given, the case defaults to `contains` if it has an
 * expected value and `not-empty` otherwise.
 *
 * @param {*} obj - Parsed document.
 * @returns {Array<{name: string, output: *, expected: *, asserts: Array<{type: string, value: *}>}>}
 */
function parseStructured(obj) {
  const source = Array.isArray(obj) ? obj : obj && (obj.cases || obj.tests);
  // A non-array `cases` value (e.g. a string) is treated as "no cases" rather
  // than crashing on a missing .map().
  const arr = Array.isArray(source) ? source : [];

  return arr.map((entry, i) => {
    // A null or scalar entry (e.g. an empty YAML list item) becomes an empty case.
    const t = entry && typeof entry === 'object' ? entry : {};
    const vars = t.vars || {};
    const output = t.output != null ? t.output : t.actual != null ? t.actual : t.response != null ? t.response : vars.output || '';
    const expected = t.expected != null ? t.expected : t.gold;

    const raw = t.assert || t.asserts;
    const list = Array.isArray(raw) ? raw : raw ? [raw] : [];
    let asserts = list
      .map((a) => (typeof a === 'string' ? { type: a } : a))
      .filter((a) => a && a.type)
      .map((a) => ({ type: normType(a.type), value: a.value != null ? a.value : a.expected }));
    if (!asserts.length) asserts = [{ type: expected ? 'contains' : 'not-empty', value: expected }];

    return { name: t.name || t.description || t.id || `Case ${i + 1}`, output, expected, asserts };
  });
}
|
|
115
|
+
|
|
116
|
+
/**
 * Reads an eval file and converts it into cases.
 *
 * The parser is chosen by file extension (.csv, .json, .yaml/.yml). For any
 * other extension the content is tried as JSON, then as YAML (when js-yaml is
 * available and the document is an object), and finally as CSV.
 *
 * A leading UTF-8 byte-order mark is removed first: readFileSync keeps it in
 * the decoded string and JSON.parse rejects it.
 *
 * @param {string} file - Path to the eval file.
 * @returns {Array<object>} Cases produced by parseCsv or parseStructured.
 * @throws {Error} On read failure, invalid JSON in a .json file, or a missing
 *   optional parser dependency for the requested format.
 */
function parseFile(file) {
  const raw = fs.readFileSync(file, 'utf8');
  const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
  const ext = path.extname(file).toLowerCase().replace('.', '');

  if (ext === 'csv') return parseCsv(text);
  if (ext === 'json') return parseStructured(JSON.parse(text));
  if (ext === 'yaml' || ext === 'yml') {
    if (!yaml) throw new Error('Install "js-yaml" to read YAML files');
    // NOTE(review): relies on yaml.load being safe for untrusted documents,
    // which is presumably the case for the js-yaml 4.x range this package
    // declares — confirm before widening that range.
    return parseStructured(yaml.load(text));
  }

  // Unknown extension: sniff the format, most specific first.
  try {
    return parseStructured(JSON.parse(text));
  } catch (_) {
    /* not json */
  }
  if (yaml) {
    try {
      const y = yaml.load(text);
      if (y && typeof y === 'object') return parseStructured(y);
    } catch (_) {
      /* not yaml */
    }
  }
  return parseCsv(text);
}
|
|
140
|
+
|
|
141
|
+
// -------------------------------------------------------------------- cli ----
|
|
142
|
+
// ANSI escape sequences used to style terminal output.
const C = {
  g: '\x1b[32m', // green
  r: '\x1b[31m', // red
  y: '\x1b[33m', // yellow
  d: '\x1b[2m', // dim
  b: '\x1b[1m', // bold
  x: '\x1b[0m', // reset
};
|
|
143
|
+
/**
 * Wraps text in an ANSI style followed by a reset, but only when stdout is a
 * terminal; otherwise the text is returned untouched so piped output stays
 * free of escape codes.
 *
 * @param {string} s - Text to style.
 * @param {string} c - ANSI escape sequence(s) to prefix.
 * @returns {string}
 */
const color = (s, c) => {
  if (!process.stdout.isTTY) return s;
  return c + s + C.x;
};
|
|
144
|
+
|
|
145
|
+
const args = process.argv.slice(2);
|
|
146
|
+
// True when the exact flag appears anywhere among the CLI arguments.
const has = (flag) => args.indexOf(flag) !== -1;
|
|
147
|
+
/**
 * Returns the value supplied for a command-line flag.
 *
 * Two spellings are accepted: `--name value` (the following argument, unless
 * it is missing or itself starts with `--`) and `--name=value`. Without the
 * second form, `--min=80` was silently ignored and the gate stayed at its
 * default.
 *
 * @param {string} name - Flag including its dashes, e.g. '--min'.
 * @param {*} def - Value returned when the flag has no usable value.
 * @returns {*} The flag's value as a string, or `def`.
 */
function flagVal(name, def) {
  const i = args.indexOf(name);
  if (i >= 0 && args[i + 1] && !args[i + 1].startsWith('--')) return args[i + 1];

  const prefix = name + '=';
  const inline = args.find((a) => a.startsWith(prefix));
  // `--name=` with nothing after the equals sign counts as "no value".
  if (inline !== undefined && inline.length > prefix.length) return inline.slice(prefix.length);

  return def;
}
|
|
152
|
+
|
|
153
|
+
/**
 * Prints the usage text to stdout as a single console.log call. The text
 * starts and ends with an empty line.
 */
function help() {
  const lines = [
    '',
    `${color('EvalDog', C.b)} — run LLM/prompt evals locally. Deterministic. Zero tokens.`,
    '',
    'Usage:',
    'npx evaldog run <file> Grade a .csv / .json / .yaml eval file',
    'npx evaldog run <file> --min 80 Exit 1 if score < 80 (CI / agent gate)',
    'npx evaldog run <file> --json Machine-readable output (for AI agents)',
    'npx evaldog run <file> --quiet Print only the summary line',
    '',
    'Assertions: contains | icontains | equals | regex | is-json | not-empty',
    'CSV columns: name,output,expected,assert',
    'Docs: https://evaldog.com/quickstart',
    '',
  ];
  console.log(lines.join('\n'));
}
|
|
168
|
+
|
|
169
|
+
/**
 * CLI entry point: validates the arguments, grades the file and prints the
 * result.
 *
 * Exit codes: 0 when the score meets the `--min` gate (or help was requested),
 * 1 when the score is below the gate, 2 for usage or input errors.
 *
 * The code is returned and assigned to process.exitCode instead of calling
 * process.exit(): exiting immediately after console.log can cut off output
 * that is still being written when stdout is a pipe, which is exactly how
 * `--json` is meant to be consumed.
 *
 * @returns {number} Process exit code.
 */
function main() {
  const cmd = args[0];
  if (!cmd || cmd === '-h' || cmd === '--help' || cmd === 'help') {
    help();
    return 0;
  }
  if (cmd !== 'run') {
    console.error('Unknown command: ' + cmd);
    help();
    return 2;
  }

  const file = args[1];
  if (!file || file.startsWith('--')) {
    console.error('Provide a file: evaldog run <file>');
    return 2;
  }
  if (!fs.existsSync(file)) {
    console.error('File not found: ' + file);
    return 2;
  }

  // A non-numeric --min used to fall back to 0, silently disabling the gate.
  const min = parseInt(flagVal('--min', '0'), 10);
  if (Number.isNaN(min)) {
    console.error('Invalid --min value: expected a number');
    return 2;
  }

  let res;
  try {
    const cases = parseFile(file);
    if (!cases.length) {
      console.error('No test cases found in ' + file);
      return 2;
    }
    res = grade(cases);
  } catch (e) {
    console.error('Error: ' + e.message);
    return 2;
  }

  const gateCode = res.score >= min ? 0 : 1;

  if (has('--json')) {
    console.log(JSON.stringify(res, null, 2));
    return gateCode;
  }

  if (!has('--quiet')) {
    for (const c of res.cases) {
      console.log(`${c.passed ? color('✓', C.g) : color('✗', C.r)} ${c.name}`);
      if (!c.passed) for (const ck of c.checks) if (!ck.ok) console.log(color(` ✗ ${ck.label}`, C.d));
    }
    console.log('');
  }

  // Summary colour: green from 80%, yellow from 50%, red below.
  const col = res.score >= 80 ? C.g : res.score >= 50 ? C.y : C.r;
  console.log(`${color(res.score + '%', col + C.b)} ${res.passed}/${res.total} passed` + (min ? ` (gate ${min}%)` : ''));
  return gateCode;
}

process.exitCode = main();
|
package/package.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "evaldog",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Run LLM/prompt evals locally — deterministic grading for CI and AI agents. Zero tokens.",
|
|
5
|
+
"bin": { "evaldog": "index.js" },
|
|
6
|
+
"type": "commonjs",
|
|
7
|
+
"files": ["index.js", "README.md"],
|
|
8
|
+
"keywords": ["llm", "eval", "evaluation", "prompt", "testing", "rag", "ci", "ai-agent", "llmops"],
|
|
9
|
+
"homepage": "https://evaldog.com",
|
|
10
|
+
"license": "MIT",
|
|
11
|
+
"author": "The Testing Academy",
|
|
12
|
+
"engines": { "node": ">=18" },
|
|
13
|
+
"dependencies": {
|
|
14
|
+
"js-yaml": "^4.1.0",
|
|
15
|
+
"papaparse": "^5.5.2"
|
|
16
|
+
}
|
|
17
|
+
}
|