evaldog 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +46 -0
  2. package/index.js +219 -0
  3. package/package.json +17 -0
package/README.md ADDED
@@ -0,0 +1,46 @@
1
+ # evaldog
2
+
3
+ Run LLM / prompt evals locally — **deterministic grading, zero LLM tokens.**
4
+
5
+ Built for CI gates and AI agents that need a fast pass/fail on prompt/RAG outputs
6
+ without burning through the context window.
7
+
8
+ ```bash
9
+ npx evaldog run cases.csv
10
+ npx evaldog run cases.csv --min 80 # exit 1 if score < 80 (CI gate)
11
+ npx evaldog run cases.csv --json # machine-readable (for agents)
12
+ ```
13
+
14
+ ## File formats
15
+
16
+ **CSV** — `name,output,expected,assert`
17
+ ```csv
18
+ name,output,expected,assert
19
+ Password reset,Click the reset link.,reset link,contains
20
+ JSON shape,"{""ok"":true}",,is-json
21
+ ```
22
+
23
+ **JSON / YAML**
24
+ ```yaml
25
+ cases:
26
+   - name: greeting
27
+     output: "Sure! Happy to help."
28
+     assert:
29
+       - { type: not-empty }
30
+ ```
31
+
32
+ Assertions: `contains · icontains · equals · regex · is-json · not-empty`
33
+ (default `contains` when an `expected` value is present, else `not-empty`).
34
+
35
+ ## Why agents like it
36
+
37
+ An agent can grade 200 outputs with one shell call and read back a single number
38
+ and an exit code — instead of streaming every case through the model.
39
+
40
+ ```bash
41
+ evaldog run outputs.csv --json --min 90 || echo "regression!"
42
+ ```
43
+
44
+ Hosted dashboard + scheduled drift alerts: **https://evaldog.com**
45
+
46
+ MIT © The Testing Academy
package/index.js ADDED
@@ -0,0 +1,219 @@
1
+ #!/usr/bin/env node
2
+ 'use strict';
3
+
4
+ /**
5
+ * EvalDog CLI — run LLM/prompt evals locally with deterministic grading.
6
+ * Zero LLM tokens: it grades outputs you already have against simple assertions.
7
+ * Perfect for CI gates and AI agents that need a pass/fail without burning context.
8
+ */
9
+
10
+ const fs = require('fs');
11
+ const path = require('path');
12
+
13
+ let yaml = null;
14
+ let Papa = null;
15
+ try {
16
+ yaml = require('js-yaml');
17
+ } catch (_) {
18
+ /* optional until a yaml file is used */
19
+ }
20
+ try {
21
+ Papa = require('papaparse');
22
+ } catch (_) {
23
+ /* optional until a csv file is used */
24
+ }
25
+
26
+ // ---------------------------------------------------------------- grading ----
27
+ const ALIASES = {
28
+ contains: 'contains',
29
+ icontains: 'icontains',
30
+ 'i-contains': 'icontains',
31
+ equals: 'equals',
32
+ equal: 'equals',
33
+ exact: 'equals',
34
+ regex: 'regex',
35
+ matches: 'regex',
36
+ 'is-json': 'is-json',
37
+ json: 'is-json',
38
+ 'not-empty': 'not-empty',
39
+ notempty: 'not-empty',
40
+ };
41
+ const normType = (t) => ALIASES[String(t).toLowerCase().trim()] || 'contains';
42
+ const truncate = (s, n) => {
43
+ s = String(s);
44
+ return s.length > n ? s.slice(0, n) + '…' : s;
45
+ };
46
+
47
+ function runAssert(output, a, expected) {
48
+ const out = output == null ? '' : String(output);
49
+ const val = String(a.value != null ? a.value : expected != null ? expected : '');
50
+ switch (a.type) {
51
+ case 'equals':
52
+ return { ok: out.trim() === val.trim(), label: `equals "${truncate(val, 24)}"` };
53
+ case 'contains':
54
+ return { ok: out.includes(val), label: `contains "${truncate(val, 24)}"` };
55
+ case 'icontains':
56
+ return { ok: out.toLowerCase().includes(val.toLowerCase()), label: `icontains "${truncate(val, 24)}"` };
57
+ case 'regex':
58
+ try {
59
+ return { ok: new RegExp(val).test(out), label: `matches /${truncate(val, 20)}/` };
60
+ } catch (_) {
61
+ return { ok: false, label: 'invalid regex' };
62
+ }
63
+ case 'is-json':
64
+ try {
65
+ JSON.parse(out);
66
+ return { ok: true, label: 'valid JSON' };
67
+ } catch (_) {
68
+ return { ok: false, label: 'valid JSON' };
69
+ }
70
+ case 'not-empty':
71
+ return { ok: out.trim().length > 0, label: 'not empty' };
72
+ default:
73
+ return { ok: false, label: 'unknown assert' };
74
+ }
75
+ }
76
+
77
+ function grade(cases) {
78
+ const results = cases.map((c, i) => {
79
+ const asserts = c.asserts && c.asserts.length ? c.asserts : [{ type: 'not-empty' }];
80
+ const checks = asserts.map((a) => runAssert(c.output, a, c.expected));
81
+ return { name: c.name || `Case ${i + 1}`, output: c.output || '', passed: checks.every((x) => x.ok), checks };
82
+ });
83
+ const passed = results.filter((r) => r.passed).length;
84
+ const total = results.length;
85
+ return { score: total ? Math.round((passed / total) * 100) : 0, total, passed, failed: total - passed, cases: results };
86
+ }
87
+
88
+ // ---------------------------------------------------------------- parsing ----
89
+ function parseCsv(text) {
90
+ if (!Papa) throw new Error('Install "papaparse" to read CSV files');
91
+ const { data } = Papa.parse(text, { header: true, skipEmptyLines: true });
92
+ return data.map((row, i) => {
93
+ const low = {};
94
+ for (const k of Object.keys(row)) low[k.toLowerCase().trim()] = row[k];
95
+ const output = low.output != null ? low.output : low.actual != null ? low.actual : low.response || '';
96
+ const expected = low.expected != null ? low.expected : low.expected_output != null ? low.expected_output : low.gold;
97
+ const at = low.assert ? normType(low.assert) : expected ? 'contains' : 'not-empty';
98
+ return { name: low.name || low.id || low.test || `Case ${i + 1}`, output, expected, asserts: [{ type: at, value: expected }] };
99
+ });
100
+ }
101
+
102
+ function parseStructured(obj) {
103
+ const arr = Array.isArray(obj) ? obj : (obj && (obj.cases || obj.tests)) || [];
104
+ return arr.map((t, i) => {
105
+ const vars = t.vars || {};
106
+ const output = t.output != null ? t.output : t.actual != null ? t.actual : t.response != null ? t.response : vars.output || '';
107
+ const expected = t.expected != null ? t.expected : t.gold;
108
+ let asserts = [];
109
+ const raw = t.assert || t.asserts;
110
+ if (Array.isArray(raw)) asserts = raw.filter((a) => a && a.type).map((a) => ({ type: normType(a.type), value: a.value != null ? a.value : a.expected }));
111
+ if (!asserts.length) asserts = [{ type: expected ? 'contains' : 'not-empty', value: expected }];
112
+ return { name: t.name || t.description || t.id || `Case ${i + 1}`, output, expected, asserts };
113
+ });
114
+ }
115
+
116
+ function parseFile(file) {
117
+ const text = fs.readFileSync(file, 'utf8');
118
+ const ext = path.extname(file).toLowerCase().replace('.', '');
119
+ if (ext === 'csv') return parseCsv(text);
120
+ if (ext === 'json') return parseStructured(JSON.parse(text));
121
+ if (ext === 'yaml' || ext === 'yml') {
122
+ if (!yaml) throw new Error('Install "js-yaml" to read YAML files');
123
+ return parseStructured(yaml.load(text));
124
+ }
125
+ try {
126
+ return parseStructured(JSON.parse(text));
127
+ } catch (_) {
128
+ /* not json */
129
+ }
130
+ if (yaml) {
131
+ try {
132
+ const y = yaml.load(text);
133
+ if (y && typeof y === 'object') return parseStructured(y);
134
+ } catch (_) {
135
+ /* not yaml */
136
+ }
137
+ }
138
+ return parseCsv(text);
139
+ }
140
+
141
+ // -------------------------------------------------------------------- cli ----
142
+ const C = { g: '\x1b[32m', r: '\x1b[31m', y: '\x1b[33m', d: '\x1b[2m', b: '\x1b[1m', x: '\x1b[0m' };
143
+ const color = (s, c) => (process.stdout.isTTY ? c + s + C.x : s);
144
+
145
+ const args = process.argv.slice(2);
146
+ const has = (f) => args.includes(f);
147
+ function flagVal(name, def) {
148
+ const i = args.indexOf(name);
149
+ if (i >= 0 && args[i + 1] && !args[i + 1].startsWith('--')) return args[i + 1];
150
+ return def;
151
+ }
152
+
153
+ function help() {
154
+ console.log(`
155
+ ${color('EvalDog', C.b)} — run LLM/prompt evals locally. Deterministic. Zero tokens.
156
+
157
+ Usage:
158
+ npx evaldog run <file> Grade a .csv / .json / .yaml eval file
159
+ npx evaldog run <file> --min 80 Exit 1 if score < 80 (CI / agent gate)
160
+ npx evaldog run <file> --json Machine-readable output (for AI agents)
161
+ npx evaldog run <file> --quiet Print only the summary line
162
+
163
+ Assertions: contains | icontains | equals | regex | is-json | not-empty
164
+ CSV columns: name,output,expected,assert
165
+ Docs: https://evaldog.com/quickstart
166
+ `);
167
+ }
168
+
169
+ const cmd = args[0];
170
+ if (!cmd || cmd === '-h' || cmd === '--help' || cmd === 'help') {
171
+ help();
172
+ process.exit(0);
173
+ }
174
+ if (cmd !== 'run') {
175
+ console.error('Unknown command: ' + cmd);
176
+ help();
177
+ process.exit(2);
178
+ }
179
+
180
+ const file = args[1];
181
+ if (!file || file.startsWith('--')) {
182
+ console.error('Provide a file: evaldog run <file>');
183
+ process.exit(2);
184
+ }
185
+ if (!fs.existsSync(file)) {
186
+ console.error('File not found: ' + file);
187
+ process.exit(2);
188
+ }
189
+
190
+ const min = parseInt(flagVal('--min', '0'), 10) || 0;
191
+ let res;
192
+ try {
193
+ const cases = parseFile(file);
194
+ if (!cases.length) {
195
+ console.error('No test cases found in ' + file);
196
+ process.exit(2);
197
+ }
198
+ res = grade(cases);
199
+ } catch (e) {
200
+ console.error('Error: ' + e.message);
201
+ process.exit(2);
202
+ }
203
+
204
+ if (has('--json')) {
205
+ console.log(JSON.stringify(res, null, 2));
206
+ process.exit(res.score >= min ? 0 : 1);
207
+ }
208
+
209
+ if (!has('--quiet')) {
210
+ for (const c of res.cases) {
211
+ console.log(`${c.passed ? color('✓', C.g) : color('✗', C.r)} ${c.name}`);
212
+ if (!c.passed) for (const ck of c.checks) if (!ck.ok) console.log(color(` ✗ ${ck.label}`, C.d));
213
+ }
214
+ console.log('');
215
+ }
216
+
217
+ const col = res.score >= 80 ? C.g : res.score >= 50 ? C.y : C.r;
218
+ console.log(`${color(res.score + '%', col + C.b)} ${res.passed}/${res.total} passed` + (min ? ` (gate ${min}%)` : ''));
219
+ process.exit(res.score >= min ? 0 : 1);
package/package.json ADDED
@@ -0,0 +1,17 @@
1
+ {
2
+ "name": "evaldog",
3
+ "version": "0.1.0",
4
+ "description": "Run LLM/prompt evals locally — deterministic grading for CI and AI agents. Zero tokens.",
5
+ "bin": { "evaldog": "index.js" },
6
+ "type": "commonjs",
7
+ "files": ["index.js", "README.md"],
8
+ "keywords": ["llm", "eval", "evaluation", "prompt", "testing", "rag", "ci", "ai-agent", "llmops"],
9
+ "homepage": "https://evaldog.com",
10
+ "license": "MIT",
11
+ "author": "The Testing Academy",
12
+ "engines": { "node": ">=18" },
13
+ "dependencies": {
14
+ "js-yaml": "^4.1.0",
15
+ "papaparse": "^5.5.2"
16
+ }
17
+ }