@prompd/test 0.5.0-beta.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/EvaluatorEngine.d.ts +32 -0
- package/dist/EvaluatorEngine.d.ts.map +1 -0
- package/dist/EvaluatorEngine.js +97 -0
- package/dist/TestDiscovery.d.ts +28 -0
- package/dist/TestDiscovery.d.ts.map +1 -0
- package/dist/TestDiscovery.js +137 -0
- package/dist/TestParser.d.ts +25 -0
- package/dist/TestParser.d.ts.map +1 -0
- package/dist/TestParser.js +187 -0
- package/dist/TestRunner.d.ts +57 -0
- package/dist/TestRunner.d.ts.map +1 -0
- package/dist/TestRunner.js +463 -0
- package/dist/cli-types.d.ts +62 -0
- package/dist/cli-types.d.ts.map +1 -0
- package/dist/cli-types.js +6 -0
- package/dist/evaluators/NlpEvaluator.d.ts +30 -0
- package/dist/evaluators/NlpEvaluator.d.ts.map +1 -0
- package/dist/evaluators/NlpEvaluator.js +183 -0
- package/dist/evaluators/PrmdEvaluator.d.ts +42 -0
- package/dist/evaluators/PrmdEvaluator.d.ts.map +1 -0
- package/dist/evaluators/PrmdEvaluator.js +265 -0
- package/dist/evaluators/ScriptEvaluator.d.ts +19 -0
- package/dist/evaluators/ScriptEvaluator.d.ts.map +1 -0
- package/dist/evaluators/ScriptEvaluator.js +163 -0
- package/dist/evaluators/types.d.ts +19 -0
- package/dist/evaluators/types.d.ts.map +1 -0
- package/dist/evaluators/types.js +5 -0
- package/dist/index.d.ts +25 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +33 -0
- package/dist/reporters/ConsoleReporter.d.ts +17 -0
- package/dist/reporters/ConsoleReporter.d.ts.map +1 -0
- package/dist/reporters/ConsoleReporter.js +85 -0
- package/dist/reporters/JsonReporter.d.ts +11 -0
- package/dist/reporters/JsonReporter.d.ts.map +1 -0
- package/dist/reporters/JsonReporter.js +18 -0
- package/dist/reporters/JunitReporter.d.ts +15 -0
- package/dist/reporters/JunitReporter.d.ts.map +1 -0
- package/dist/reporters/JunitReporter.js +89 -0
- package/dist/reporters/types.d.ts +8 -0
- package/dist/reporters/types.d.ts.map +1 -0
- package/dist/reporters/types.js +5 -0
- package/dist/types.d.ts +119 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +5 -0
- package/package.json +34 -0
- package/src/EvaluatorEngine.ts +130 -0
- package/src/TestDiscovery.ts +133 -0
- package/src/TestParser.ts +235 -0
- package/src/TestRunner.ts +516 -0
- package/src/cli-types.ts +92 -0
- package/src/evaluators/NlpEvaluator.ts +240 -0
- package/src/evaluators/PrmdEvaluator.ts +284 -0
- package/src/evaluators/ScriptEvaluator.ts +152 -0
- package/src/evaluators/types.ts +24 -0
- package/src/index.ts +76 -0
- package/src/reporters/ConsoleReporter.ts +100 -0
- package/src/reporters/JsonReporter.ts +21 -0
- package/src/reporters/JunitReporter.ts +113 -0
- package/src/reporters/types.ts +9 -0
- package/src/types.ts +140 -0
- package/tsconfig.json +20 -0
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* NLP Evaluator - local, fast, free, deterministic assertions.
|
|
4
|
+
*
|
|
5
|
+
* Checks: contains, not_contains, matches, max_tokens, min_tokens, starts_with, ends_with
|
|
6
|
+
*/
|
|
7
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
8
|
+
exports.NlpEvaluator = void 0;
|
|
9
|
+
class NlpEvaluator {
    constructor() {
        // Discriminator used by the test runner to dispatch assertions to this evaluator.
        this.type = 'nlp';
    }
    /**
     * Run one deterministic text assertion against the prompt and/or response.
     * Resolves the target text, dispatches to the matching check, and wraps the
     * outcome — or any thrown error — in an assertion-result object.
     */
    async evaluate(assertion, context) {
        const startedAt = Date.now();
        const check = assertion.check;
        const target = assertion.evaluate || 'response';
        try {
            const subject = this.resolveTarget(target, context);
            let label;
            if (target === 'both') {
                label = 'Prompt+Response';
            }
            else if (target === 'prompt') {
                label = 'Prompt';
            }
            else {
                label = 'Output';
            }
            const outcome = this.runCheck(check, assertion.value, subject, label);
            return {
                evaluator: 'nlp',
                check,
                status: outcome.pass ? 'pass' : 'fail',
                reason: outcome.reason,
                duration: Date.now() - startedAt,
            };
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            return {
                evaluator: 'nlp',
                check,
                status: 'error',
                reason: message,
                duration: Date.now() - startedAt,
            };
        }
    }
    // Pick which text the assertion applies to; defaults to the model response.
    resolveTarget(target, context) {
        if (target === 'prompt') {
            return context.prompt;
        }
        if (target === 'both') {
            return `${context.prompt}\n\n${context.response}`;
        }
        return context.response;
    }
    // Dispatch the named check via a handler table; unknown names fail explicitly.
    runCheck(check, value, output, label = 'Output') {
        const handlers = {
            contains: () => this.checkContains(value, output, label),
            not_contains: () => this.checkNotContains(value, output, label),
            matches: () => this.checkMatches(value, output, label),
            max_tokens: () => this.checkMaxTokens(value, output),
            min_tokens: () => this.checkMinTokens(value, output),
            max_words: () => this.checkMaxWords(value, output),
            min_words: () => this.checkMinWords(value, output),
            starts_with: () => this.checkStartsWith(value, output, label),
            ends_with: () => this.checkEndsWith(value, output, label),
        };
        // Own-property guard so inherited names like "toString" don't dispatch.
        if (!Object.prototype.hasOwnProperty.call(handlers, check)) {
            return { pass: false, reason: `Unknown NLP check: ${check}` };
        }
        return handlers[check]();
    }
    // Case-insensitive: every expected value must appear somewhere in the text.
    checkContains(value, output, label) {
        const haystack = output.toLowerCase();
        const missing = [];
        for (const expected of this.toStringArray(value)) {
            if (!haystack.includes(expected.toLowerCase())) {
                missing.push(expected);
            }
        }
        if (missing.length > 0) {
            const quoted = missing.map(v => `"${v}"`).join(', ');
            return { pass: false, reason: `${label} missing: ${quoted}` };
        }
        return { pass: true, reason: `${label} contains all expected values` };
    }
    // Case-insensitive: none of the excluded values may appear in the text.
    checkNotContains(value, output, label) {
        const haystack = output.toLowerCase();
        const found = [];
        for (const excluded of this.toStringArray(value)) {
            if (haystack.includes(excluded.toLowerCase())) {
                found.push(excluded);
            }
        }
        if (found.length > 0) {
            const quoted = found.map(v => `"${v}"`).join(', ');
            return { pass: false, reason: `${label} contains excluded values: ${quoted}` };
        }
        return { pass: true, reason: `${label} does not contain any excluded values` };
    }
    // Regex test (case-sensitive); value must be a string pattern.
    checkMatches(value, output, label) {
        if (typeof value !== 'string') {
            return { pass: false, reason: '"matches" check requires a string regex pattern' };
        }
        const matched = new RegExp(value).test(output);
        return matched
            ? { pass: true, reason: `${label} matches pattern /${value}/` }
            : { pass: false, reason: `${label} does not match pattern /${value}/` };
    }
    checkMaxTokens(value, output) {
        if (typeof value !== 'number') {
            return { pass: false, reason: '"max_tokens" check requires a numeric value' };
        }
        const tokenCount = this.estimateTokens(output);
        return tokenCount <= value
            ? { pass: true, reason: `Token count ${tokenCount} <= ${value}` }
            : { pass: false, reason: `Token count ${tokenCount} exceeds max ${value}` };
    }
    checkMinTokens(value, output) {
        if (typeof value !== 'number') {
            return { pass: false, reason: '"min_tokens" check requires a numeric value' };
        }
        const tokenCount = this.estimateTokens(output);
        return tokenCount >= value
            ? { pass: true, reason: `Token count ${tokenCount} >= ${value}` }
            : { pass: false, reason: `Token count ${tokenCount} below min ${value}` };
    }
    // Leading whitespace is ignored; comparison is case-insensitive.
    checkStartsWith(value, output, label) {
        if (typeof value !== 'string') {
            return { pass: false, reason: '"starts_with" check requires a string value' };
        }
        const head = output.trimStart().toLowerCase();
        return head.startsWith(value.toLowerCase())
            ? { pass: true, reason: `${label} starts with "${value}"` }
            : { pass: false, reason: `${label} does not start with "${value}"` };
    }
    // Trailing whitespace is ignored; comparison is case-insensitive.
    checkEndsWith(value, output, label) {
        if (typeof value !== 'string') {
            return { pass: false, reason: '"ends_with" check requires a string value' };
        }
        const tail = output.trimEnd().toLowerCase();
        return tail.endsWith(value.toLowerCase())
            ? { pass: true, reason: `${label} ends with "${value}"` }
            : { pass: false, reason: `${label} does not end with "${value}"` };
    }
    checkMaxWords(value, output) {
        if (typeof value !== 'number') {
            return { pass: false, reason: '"max_words" check requires a numeric value' };
        }
        const wordCount = this.countWords(output);
        return wordCount <= value
            ? { pass: true, reason: `Word count ${wordCount} <= ${value}` }
            : { pass: false, reason: `Word count ${wordCount} exceeds max ${value}` };
    }
    checkMinWords(value, output) {
        if (typeof value !== 'number') {
            return { pass: false, reason: '"min_words" check requires a numeric value' };
        }
        const wordCount = this.countWords(output);
        return wordCount >= value
            ? { pass: true, reason: `Word count ${wordCount} >= ${value}` }
            : { pass: false, reason: `Word count ${wordCount} below min ${value}` };
    }
    // Words are maximal runs of non-whitespace characters.
    countWords(text) {
        const pieces = text.trim().split(/\s+/);
        return pieces.filter(piece => piece.length > 0).length;
    }
    /**
     * Rough token estimation: ~4 characters per token (GPT-family average).
     * This is intentionally approximate — for precise counting, use a tokenizer.
     */
    estimateTokens(text) {
        return Math.ceil(text.length / 4);
    }
    // Normalize an assertion value into a string array (nullish -> empty list).
    toStringArray(value) {
        if (value == null) {
            return [];
        }
        return Array.isArray(value) ? value.map(String) : [String(value)];
    }
}
|
|
183
|
+
exports.NlpEvaluator = NlpEvaluator;
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/**
 * Prmd Evaluator - LLM-based evaluation via @prompd/cli.
 *
 * Modes:
 * - prompt: "@scope/pkg@version" -> uses a registry package as the evaluator
 * - prompt: "./path" -> uses a local .prmd file as the evaluator
 * - (no prompt field) -> uses the content block of the .test.prmd
 *
 * The evaluator prompt receives {{input}}, {{output}}, and {{params}} variables.
 * Response must start with PASS or FAIL.
 *
 * NOTE: this is a generated declaration file (see the sourceMappingURL below);
 * doc edits should also be mirrored in src/evaluators/PrmdEvaluator.ts.
 */
import type { Evaluator, EvaluatorContext } from './types';
import type { AssertionDef, AssertionResult } from '../types';
import type { CompilerModule } from '../cli-types';
/** Construction options for {@link PrmdEvaluator}. */
export interface PrmdEvaluatorOptions {
    /** Directory of the .test.prmd file; relative `prompt:` paths resolve against it. */
    testFileDir: string;
    /** Fallback evaluator content (the .test.prmd content block) used when the assertion has no `prompt:` field. */
    evaluatorPrompt?: string;
    /** Workspace root forwarded to the compiler's compile() call. */
    workspaceRoot?: string;
    /** Registry URL forwarded to the compiler for package resolution. */
    registryUrl?: string;
    /** Injected @prompd/cli module; evaluate() throws if it was never provided. */
    cliModule?: CompilerModule;
    /** Run-level provider override (assertion-level provider takes precedence). */
    provider?: string;
    /** Run-level model override (assertion-level model takes precedence). */
    model?: string;
}
export declare class PrmdEvaluator implements Evaluator {
    /** Discriminator used by the test runner to route assertions to this evaluator. */
    readonly type = "prmd";
    private options;
    private cliModule;
    constructor(options: PrmdEvaluatorOptions);
    /**
     * Compile the evaluator prompt with the test context as parameters, execute it
     * against the configured LLM provider, and parse the PASS/FAIL verdict from the
     * first line of the response. Failures are reported as status 'error' results
     * rather than thrown.
     */
    evaluate(assertion: AssertionDef, context: EvaluatorContext): Promise<AssertionResult>;
    private resolveEvaluatorContent;
    private resolvePromptTarget;
    /**
     * Wrap a registry reference as a minimal .prmd that inherits from the evaluator package.
     * The compiler handles resolution, download, and caching.
     */
    private wrapAsInherits;
    private compileEvaluator;
    private parseEvaluatorResponse;
    private getDefaultModel;
    private getCli;
}
|
|
42
|
+
//# sourceMappingURL=PrmdEvaluator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"PrmdEvaluator.d.ts","sourceRoot":"","sources":["../../src/evaluators/PrmdEvaluator.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAIH,OAAO,KAAK,EAAE,SAAS,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAC;AAC3D,OAAO,KAAK,EAAE,YAAY,EAAE,eAAe,EAAE,MAAM,UAAU,CAAC;AAC9D,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAInD,MAAM,WAAW,oBAAoB;IACnC,WAAW,EAAE,MAAM,CAAC;IACpB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,cAAc,CAAC;IAC3B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED,qBAAa,aAAc,YAAW,SAAS;IAC7C,QAAQ,CAAC,IAAI,UAAU;IACvB,OAAO,CAAC,OAAO,CAAuB;IACtC,OAAO,CAAC,SAAS,CAA+B;gBAEpC,OAAO,EAAE,oBAAoB;IAOnC,QAAQ,CAAC,SAAS,EAAE,YAAY,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,eAAe,CAAC;YA+F9E,uBAAuB;YAUvB,mBAAmB;IAejC;;;OAGG;IACH,OAAO,CAAC,cAAc;YAgBR,gBAAgB;IAyD9B,OAAO,CAAC,sBAAsB;IAwB9B,OAAO,CAAC,eAAe;YAYT,MAAM;CAQrB"}
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Prmd Evaluator - LLM-based evaluation via @prompd/cli.
|
|
4
|
+
*
|
|
5
|
+
* Modes:
|
|
6
|
+
* - prompt: "@scope/pkg@version" -> uses a registry package as the evaluator
|
|
7
|
+
* - prompt: "./path" -> uses a local .prmd file as the evaluator
|
|
8
|
+
* - (no prompt field) -> uses the content block of the .test.prmd
|
|
9
|
+
*
|
|
10
|
+
* The evaluator prompt receives {{input}}, {{output}}, and {{params}} variables.
|
|
11
|
+
* Response must start with PASS or FAIL.
|
|
12
|
+
*/
|
|
13
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
14
|
+
if (k2 === undefined) k2 = k;
|
|
15
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
16
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
17
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
18
|
+
}
|
|
19
|
+
Object.defineProperty(o, k2, desc);
|
|
20
|
+
}) : (function(o, m, k, k2) {
|
|
21
|
+
if (k2 === undefined) k2 = k;
|
|
22
|
+
o[k2] = m[k];
|
|
23
|
+
}));
|
|
24
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
25
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
26
|
+
}) : function(o, v) {
|
|
27
|
+
o["default"] = v;
|
|
28
|
+
});
|
|
29
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
30
|
+
var ownKeys = function(o) {
|
|
31
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
32
|
+
var ar = [];
|
|
33
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
34
|
+
return ar;
|
|
35
|
+
};
|
|
36
|
+
return ownKeys(o);
|
|
37
|
+
};
|
|
38
|
+
return function (mod) {
|
|
39
|
+
if (mod && mod.__esModule) return mod;
|
|
40
|
+
var result = {};
|
|
41
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
42
|
+
__setModuleDefault(result, mod);
|
|
43
|
+
return result;
|
|
44
|
+
};
|
|
45
|
+
})();
|
|
46
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
47
|
+
exports.PrmdEvaluator = void 0;
|
|
48
|
+
const path = __importStar(require("path"));
|
|
49
|
+
const fs = __importStar(require("fs"));
|
|
50
|
+
/** Matches a leading PASS/FAIL verdict; any trailing text is captured as the reason. */
const PASS_FAIL_REGEX = /^(PASS|FAIL)[:\s]*(.*)/i;
class PrmdEvaluator {
    /**
     * @param {PrmdEvaluatorOptions} options - evaluator configuration; `cliModule`
     *   must be injected by the host (this package does not import @prompd/cli itself).
     */
    constructor(options) {
        this.type = 'prmd';
        this.cliModule = null;
        this.options = options;
        if (options.cliModule) {
            this.cliModule = options.cliModule;
        }
    }
    /**
     * Resolve the evaluator prompt, compile it with the test context as template
     * parameters, execute it against the configured LLM provider, and parse the
     * PASS/FAIL verdict from the response. All failure paths return a status of
     * 'error' rather than throwing.
     *
     * Fix: removed the unconditional console.log debug statements that polluted
     * stdout of every run and leaked evaluator prompt/response previews.
     */
    async evaluate(assertion, context) {
        const start = Date.now();
        try {
            const evaluatorContent = await this.resolveEvaluatorContent(assertion);
            if (!evaluatorContent) {
                return {
                    evaluator: 'prmd',
                    status: 'error',
                    reason: 'Could not resolve evaluator prompt content',
                    duration: Date.now() - start,
                };
            }
            // Compile the evaluator prompt with context as parameters
            const cli = await this.getCli();
            const compiled = await this.compileEvaluator(cli, evaluatorContent, context);
            if (!compiled) {
                return {
                    evaluator: 'prmd',
                    status: 'error',
                    reason: 'Evaluator prompt compilation failed',
                    duration: Date.now() - start,
                };
            }
            // Execute against LLM using callLLM directly (avoids executeRawText re-compilation)
            const executor = new cli.PrompdExecutor();
            // Resolve provider/model/apiKey — same logic as TestRunner
            const configManager = cli.ConfigManager?.getInstance
                ? cli.ConfigManager.getInstance()
                : null;
            const config = configManager?.config || {};
            // Priority: assertion-level > run options (UI selector) > config defaults
            const provider = assertion.provider || this.options.provider || config.defaultProvider || 'openai';
            const rawModel = assertion.model || this.options.model || config.default_model || config.defaultModel || '';
            const model = rawModel || this.getDefaultModel(provider);
            const apiKey = configManager?.getApiKey?.(provider, config) || '';
            // Ollama runs locally and needs no key; every other provider does.
            if (!apiKey && provider !== 'ollama') {
                return {
                    evaluator: 'prmd',
                    status: 'error',
                    reason: `No API key configured for provider "${provider}"`,
                    duration: Date.now() - start,
                };
            }
            const execResult = await executor.callLLM(provider, model, compiled, apiKey);
            if (!execResult.success) {
                return {
                    evaluator: 'prmd',
                    status: 'error',
                    reason: execResult.error || 'Evaluator LLM execution failed',
                    duration: Date.now() - start,
                };
            }
            const response = execResult.response || execResult.content || '';
            if (!response) {
                return {
                    evaluator: 'prmd',
                    status: 'error',
                    reason: 'No response from evaluator',
                    duration: Date.now() - start,
                };
            }
            // Parse PASS/FAIL from response
            return this.parseEvaluatorResponse(response, Date.now() - start);
        }
        catch (err) {
            return {
                evaluator: 'prmd',
                status: 'error',
                reason: err instanceof Error ? err.message : String(err),
                duration: Date.now() - start,
            };
        }
    }
    /** Resolve evaluator prompt text: `prompt:` field if present, else the .test.prmd content block. */
    async resolveEvaluatorContent(assertion) {
        // If prompt: is specified, resolve it (registry ref, local file)
        if (assertion.prompt) {
            return this.resolvePromptTarget(assertion.prompt);
        }
        // No prompt: field — use the content block of the .test.prmd
        return this.options.evaluatorPrompt || null;
    }
    /** Resolve a `prompt:` value to .prmd text (registry reference or local file). */
    async resolvePromptTarget(prompt) {
        // Registry reference: @scope/package@version
        if (prompt.startsWith('@')) {
            return this.wrapAsInherits(prompt);
        }
        // Local file path, resolved relative to the .test.prmd file's directory
        const resolved = path.resolve(this.options.testFileDir, prompt);
        if (!fs.existsSync(resolved)) {
            throw new Error(`Evaluator prompt file not found: ${resolved}`);
        }
        return fs.readFileSync(resolved, 'utf-8');
    }
    /**
     * Wrap a registry reference as a minimal .prmd that inherits from the evaluator package.
     * The compiler handles resolution, download, and caching.
     */
    wrapAsInherits(registryRef) {
        return [
            '---',
            `inherits: "${registryRef}"`,
            'parameters:',
            '  - name: prompt',
            '    type: string',
            '  - name: response',
            '    type: string',
            '  - name: params',
            '    type: string',
            '---',
            '',
        ].join('\n');
    }
    /** Compile the evaluator .prmd in-memory, injecting the test context as template parameters. */
    async compileEvaluator(cli, content, context) {
        // If content doesn't start with frontmatter, wrap it with minimal frontmatter
        // so the compiler can process it. Content blocks from .test.prmd are raw markdown.
        let prmdContent = content;
        if (!content.trimStart().startsWith('---')) {
            prmdContent = [
                '---',
                'id: evaluator',
                'name: "Test Evaluator"',
                'version: 0.0.1',
                'parameters:',
                '  - name: prompt',
                '    type: string',
                '  - name: response',
                '    type: string',
                '  - name: params',
                '    type: object',
                '---',
                '',
                content,
            ].join('\n');
        }
        const memFs = new cli.MemoryFileSystem({ '/evaluator.prmd': prmdContent });
        const compiler = new cli.PrompdCompiler();
        // Inject evaluation context as template variables
        const parameters = {
            prompt: context.prompt,
            response: context.response,
            params: JSON.stringify(context.params, null, 2),
        };
        // Also expose individual params via dot notation
        for (const [key, value] of Object.entries(context.params)) {
            parameters[`params.${key}`] = String(value);
        }
        const result = await compiler.compile('/evaluator.prmd', {
            outputFormat: 'markdown',
            parameters,
            fileSystem: memFs,
            workspaceRoot: this.options.workspaceRoot,
            registryUrl: this.options.registryUrl,
        });
        // CLI compile() may return a string directly or an object
        if (typeof result === 'string') {
            return result || null;
        }
        return result.output || null;
    }
    /** Parse the PASS/FAIL verdict from the first line of the evaluator's response. */
    parseEvaluatorResponse(response, duration) {
        const firstLine = response.trim().split('\n')[0];
        const match = firstLine.match(PASS_FAIL_REGEX);
        if (!match) {
            return {
                evaluator: 'prmd',
                status: 'error',
                reason: `Evaluator response did not start with PASS or FAIL. Got: "${firstLine.substring(0, 100)}"`,
                duration,
            };
        }
        const verdict = match[1].toUpperCase();
        const reason = match[2]?.trim() || undefined;
        return {
            evaluator: 'prmd',
            status: verdict === 'PASS' ? 'pass' : 'fail',
            reason: reason || `Evaluator returned ${verdict}`,
            duration,
        };
    }
    /** Fallback model per provider when neither assertion, options, nor config name one. */
    getDefaultModel(provider) {
        const defaults = {
            openai: 'gpt-4o',
            anthropic: 'claude-sonnet-4-20250514',
            groq: 'llama-3.1-70b-versatile',
            google: 'gemini-2.0-flash',
            mistral: 'mistral-large-latest',
            deepseek: 'deepseek-chat',
        };
        return defaults[provider.toLowerCase()] || 'gpt-4o';
    }
    /** Return the injected @prompd/cli module, or throw if none was provided. */
    async getCli() {
        if (!this.cliModule) {
            throw new Error('@prompd/cli module not provided. Pass it via PrmdEvaluatorOptions.cliModule');
        }
        return this.cliModule;
    }
}
|
|
265
|
+
exports.PrmdEvaluator = PrmdEvaluator;
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
 * Script Evaluator - runs external scripts with stdin/stdout contract.
 *
 * Contract:
 * - Receives JSON on stdin: { input, output, params, metadata }
 * - Exit code 0 = PASS, 1 = FAIL, other = ERROR
 * - Stdout = reason (optional)
 *
 * NOTE: this is a generated declaration file (see the sourceMappingURL below);
 * doc edits should also be mirrored in src/evaluators/ScriptEvaluator.ts.
 */
import type { Evaluator, EvaluatorContext } from './types';
import type { AssertionDef, AssertionResult } from '../types';
export declare class ScriptEvaluator implements Evaluator {
    /** Discriminator used by the test runner to route assertions to this evaluator. */
    readonly type = "script";
    /** Directory of the .test.prmd file; `run:` paths resolve against it and must stay inside it. */
    private testFileDir;
    constructor(testFileDir: string);
    /**
     * Spawn the script named by `assertion.run`, pipe the JSON context to its stdin,
     * and map its exit code to pass (0) / fail (non-zero); spawn or timeout problems
     * are reported as status 'error'.
     */
    evaluate(assertion: AssertionDef, context: EvaluatorContext): Promise<AssertionResult>;
    private runScript;
    private getRunner;
}
|
|
19
|
+
//# sourceMappingURL=ScriptEvaluator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ScriptEvaluator.d.ts","sourceRoot":"","sources":["../../src/evaluators/ScriptEvaluator.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAKH,OAAO,KAAK,EAAE,SAAS,EAAE,gBAAgB,EAAE,MAAM,SAAS,CAAC;AAC3D,OAAO,KAAK,EAAE,YAAY,EAAE,eAAe,EAAkB,MAAM,UAAU,CAAC;AAI9E,qBAAa,eAAgB,YAAW,SAAS;IAC/C,QAAQ,CAAC,IAAI,YAAY;IACzB,OAAO,CAAC,WAAW,CAAS;gBAEhB,WAAW,EAAE,MAAM;IAIzB,QAAQ,CAAC,SAAS,EAAE,YAAY,EAAE,OAAO,EAAE,gBAAgB,GAAG,OAAO,CAAC,eAAe,CAAC;IAsD5F,OAAO,CAAC,SAAS;IAoDjB,OAAO,CAAC,SAAS;CAoBlB"}
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Script Evaluator - runs external scripts with stdin/stdout contract.
|
|
4
|
+
*
|
|
5
|
+
* Contract:
|
|
6
|
+
* - Receives JSON on stdin: { input, output, params, metadata }
|
|
7
|
+
* - Exit code 0 = PASS, 1 = FAIL, other = ERROR
|
|
8
|
+
* - Stdout = reason (optional)
|
|
9
|
+
*/
|
|
10
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
11
|
+
if (k2 === undefined) k2 = k;
|
|
12
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
13
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
14
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
15
|
+
}
|
|
16
|
+
Object.defineProperty(o, k2, desc);
|
|
17
|
+
}) : (function(o, m, k, k2) {
|
|
18
|
+
if (k2 === undefined) k2 = k;
|
|
19
|
+
o[k2] = m[k];
|
|
20
|
+
}));
|
|
21
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
22
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
23
|
+
}) : function(o, v) {
|
|
24
|
+
o["default"] = v;
|
|
25
|
+
});
|
|
26
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
27
|
+
var ownKeys = function(o) {
|
|
28
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
29
|
+
var ar = [];
|
|
30
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
31
|
+
return ar;
|
|
32
|
+
};
|
|
33
|
+
return ownKeys(o);
|
|
34
|
+
};
|
|
35
|
+
return function (mod) {
|
|
36
|
+
if (mod && mod.__esModule) return mod;
|
|
37
|
+
var result = {};
|
|
38
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
39
|
+
__setModuleDefault(result, mod);
|
|
40
|
+
return result;
|
|
41
|
+
};
|
|
42
|
+
})();
|
|
43
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
44
|
+
exports.ScriptEvaluator = void 0;
|
|
45
|
+
const child_process_1 = require("child_process");
|
|
46
|
+
const path = __importStar(require("path"));
|
|
47
|
+
const fs = __importStar(require("fs"));
|
|
48
|
+
// Hard cap on script wall-clock time; longer runs are killed and surface as errors.
const SCRIPT_TIMEOUT_MS = 30000;
class ScriptEvaluator {
    /** @param {string} testFileDir - directory of the .test.prmd; scripts resolve relative to it. */
    constructor(testFileDir) {
        this.type = 'script';
        this.testFileDir = testFileDir;
    }
    /**
     * Run the external script named by `assertion.run`.
     * Contract: JSON context on stdin; exit 0 = pass, non-zero = fail; stdout = reason.
     *
     * Fixes:
     * - The directory-containment check previously used a raw `startsWith` prefix
     *   comparison, so a sibling directory such as "<testDir>-evil" passed the
     *   escape check. It now uses path.relative, which rejects any path outside
     *   the test directory tree.
     * - On failure with empty stdout, stderr (previously collected but discarded)
     *   is used as the reason before falling back to the generic message.
     */
    async evaluate(assertion, context) {
        const start = Date.now();
        const scriptPath = assertion.run;
        if (!scriptPath) {
            return {
                evaluator: 'script',
                status: 'error',
                reason: 'No "run" path specified for script evaluator',
                duration: Date.now() - start,
            };
        }
        const resolvedPath = path.resolve(this.testFileDir, scriptPath);
        if (!fs.existsSync(resolvedPath)) {
            return {
                evaluator: 'script',
                status: 'error',
                reason: `Script not found: ${resolvedPath}`,
                duration: Date.now() - start,
            };
        }
        // Validate script stays within the test file's directory tree.
        const relativePath = path.relative(path.resolve(this.testFileDir), resolvedPath);
        if (relativePath.startsWith('..') || path.isAbsolute(relativePath)) {
            return {
                evaluator: 'script',
                status: 'error',
                reason: `Script path escapes test directory: ${scriptPath}`,
                duration: Date.now() - start,
            };
        }
        try {
            const result = await this.runScript(resolvedPath, context, assertion);
            const passed = result.exitCode === 0;
            return {
                evaluator: 'script',
                status: passed ? 'pass' : 'fail',
                reason: result.stdout.trim() ||
                    (passed ? 'Script passed' : result.stderr.trim() || 'Script failed'),
                duration: Date.now() - start,
            };
        }
        catch (err) {
            return {
                evaluator: 'script',
                status: 'error',
                reason: err instanceof Error ? err.message : String(err),
                duration: Date.now() - start,
            };
        }
    }
    /** Spawn the script, stream the JSON context to its stdin, and collect its output. */
    runScript(scriptPath, context, assertion) {
        return new Promise((resolve, reject) => {
            const { command, args } = this.getRunner(scriptPath);
            const child = child_process_1.spawn(command, args, {
                cwd: this.testFileDir,
                timeout: SCRIPT_TIMEOUT_MS,
                stdio: ['pipe', 'pipe', 'pipe'],
                // Windows needs a shell to resolve launcher commands like "npx".
                shell: process.platform === 'win32',
            });
            let stdout = '';
            let stderr = '';
            child.stdout.on('data', (data) => {
                stdout += data.toString();
            });
            child.stderr.on('data', (data) => {
                stderr += data.toString();
            });
            child.on('error', (err) => {
                reject(new Error(`Failed to spawn script: ${err.message}`));
            });
            child.on('close', (code) => {
                // A null exit code means the process was terminated by a signal
                // (e.g. the timeout above killed it).
                if (code === null) {
                    reject(new Error('Script process was killed (timeout or signal)'));
                    return;
                }
                resolve({ exitCode: code, stdout, stderr });
            });
            // Send context as JSON on stdin, include target so script knows what to evaluate
            const target = assertion.evaluate || 'response';
            const payload = JSON.stringify({
                target,
                prompt: context.prompt,
                response: context.response,
                params: context.params,
                metadata: context.metadata,
            });
            child.stdin.write(payload);
            child.stdin.end();
        });
    }
    /** Pick an interpreter by file extension; unknown extensions run directly (shebang/OS association). */
    getRunner(scriptPath) {
        const ext = path.extname(scriptPath).toLowerCase();
        switch (ext) {
            case '.ts':
                return { command: 'npx', args: ['tsx', scriptPath] };
            case '.js':
            case '.mjs':
                return { command: 'node', args: [scriptPath] };
            case '.py':
                return { command: 'python', args: [scriptPath] };
            case '.sh':
                return { command: 'bash', args: [scriptPath] };
            case '.ps1':
                return { command: 'powershell', args: ['-File', scriptPath] };
            default:
                // For unknown extensions, try running directly (relies on shebang or OS association)
                return { command: scriptPath, args: [] };
        }
    }
}
|
|
163
|
+
exports.ScriptEvaluator = ScriptEvaluator;
|