@prompd/test 0.5.0-beta.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/dist/EvaluatorEngine.d.ts +32 -0
  2. package/dist/EvaluatorEngine.d.ts.map +1 -0
  3. package/dist/EvaluatorEngine.js +97 -0
  4. package/dist/TestDiscovery.d.ts +28 -0
  5. package/dist/TestDiscovery.d.ts.map +1 -0
  6. package/dist/TestDiscovery.js +137 -0
  7. package/dist/TestParser.d.ts +25 -0
  8. package/dist/TestParser.d.ts.map +1 -0
  9. package/dist/TestParser.js +187 -0
  10. package/dist/TestRunner.d.ts +57 -0
  11. package/dist/TestRunner.d.ts.map +1 -0
  12. package/dist/TestRunner.js +463 -0
  13. package/dist/cli-types.d.ts +62 -0
  14. package/dist/cli-types.d.ts.map +1 -0
  15. package/dist/cli-types.js +6 -0
  16. package/dist/evaluators/NlpEvaluator.d.ts +30 -0
  17. package/dist/evaluators/NlpEvaluator.d.ts.map +1 -0
  18. package/dist/evaluators/NlpEvaluator.js +183 -0
  19. package/dist/evaluators/PrmdEvaluator.d.ts +42 -0
  20. package/dist/evaluators/PrmdEvaluator.d.ts.map +1 -0
  21. package/dist/evaluators/PrmdEvaluator.js +265 -0
  22. package/dist/evaluators/ScriptEvaluator.d.ts +19 -0
  23. package/dist/evaluators/ScriptEvaluator.d.ts.map +1 -0
  24. package/dist/evaluators/ScriptEvaluator.js +163 -0
  25. package/dist/evaluators/types.d.ts +19 -0
  26. package/dist/evaluators/types.d.ts.map +1 -0
  27. package/dist/evaluators/types.js +5 -0
  28. package/dist/index.d.ts +25 -0
  29. package/dist/index.d.ts.map +1 -0
  30. package/dist/index.js +33 -0
  31. package/dist/reporters/ConsoleReporter.d.ts +17 -0
  32. package/dist/reporters/ConsoleReporter.d.ts.map +1 -0
  33. package/dist/reporters/ConsoleReporter.js +85 -0
  34. package/dist/reporters/JsonReporter.d.ts +11 -0
  35. package/dist/reporters/JsonReporter.d.ts.map +1 -0
  36. package/dist/reporters/JsonReporter.js +18 -0
  37. package/dist/reporters/JunitReporter.d.ts +15 -0
  38. package/dist/reporters/JunitReporter.d.ts.map +1 -0
  39. package/dist/reporters/JunitReporter.js +89 -0
  40. package/dist/reporters/types.d.ts +8 -0
  41. package/dist/reporters/types.d.ts.map +1 -0
  42. package/dist/reporters/types.js +5 -0
  43. package/dist/types.d.ts +119 -0
  44. package/dist/types.d.ts.map +1 -0
  45. package/dist/types.js +5 -0
  46. package/package.json +34 -0
  47. package/src/EvaluatorEngine.ts +130 -0
  48. package/src/TestDiscovery.ts +133 -0
  49. package/src/TestParser.ts +235 -0
  50. package/src/TestRunner.ts +516 -0
  51. package/src/cli-types.ts +92 -0
  52. package/src/evaluators/NlpEvaluator.ts +240 -0
  53. package/src/evaluators/PrmdEvaluator.ts +284 -0
  54. package/src/evaluators/ScriptEvaluator.ts +152 -0
  55. package/src/evaluators/types.ts +24 -0
  56. package/src/index.ts +76 -0
  57. package/src/reporters/ConsoleReporter.ts +100 -0
  58. package/src/reporters/JsonReporter.ts +21 -0
  59. package/src/reporters/JunitReporter.ts +113 -0
  60. package/src/reporters/types.ts +9 -0
  61. package/src/types.ts +140 -0
  62. package/tsconfig.json +20 -0
@@ -0,0 +1,240 @@
1
+ /**
2
+ * NLP Evaluator - local, fast, free, deterministic assertions.
3
+ *
4
+ * Checks: contains, not_contains, matches, max_tokens, min_tokens, starts_with, ends_with
5
+ */
6
+
7
+ import type { Evaluator, EvaluatorContext } from './types';
8
+ import type { AssertionDef, AssertionResult, NlpCheck, EvaluateTarget } from '../types';
9
+
10
+ export class NlpEvaluator implements Evaluator {
11
+ readonly type = 'nlp';
12
+
13
+ async evaluate(assertion: AssertionDef, context: EvaluatorContext): Promise<AssertionResult> {
14
+ const start = Date.now();
15
+ const check = assertion.check as NlpCheck;
16
+ const target: EvaluateTarget = assertion.evaluate || 'response';
17
+
18
+ try {
19
+ const text = this.resolveTarget(target, context);
20
+ const targetLabel = target === 'both' ? 'Prompt+Response' : target === 'prompt' ? 'Prompt' : 'Output';
21
+ const result = this.runCheck(check, assertion.value, text, targetLabel);
22
+ return {
23
+ evaluator: 'nlp',
24
+ check,
25
+ status: result.pass ? 'pass' : 'fail',
26
+ reason: result.reason,
27
+ duration: Date.now() - start,
28
+ };
29
+ } catch (err) {
30
+ return {
31
+ evaluator: 'nlp',
32
+ check,
33
+ status: 'error',
34
+ reason: err instanceof Error ? err.message : String(err),
35
+ duration: Date.now() - start,
36
+ };
37
+ }
38
+ }
39
+
40
+ private resolveTarget(target: EvaluateTarget, context: EvaluatorContext): string {
41
+ switch (target) {
42
+ case 'prompt': return context.prompt;
43
+ case 'both': return `${context.prompt}\n\n${context.response}`;
44
+ case 'response':
45
+ default: return context.response;
46
+ }
47
+ }
48
+
49
+ private runCheck(
50
+ check: NlpCheck,
51
+ value: string | string[] | number | undefined,
52
+ output: string,
53
+ label: string = 'Output'
54
+ ): { pass: boolean; reason: string } {
55
+ switch (check) {
56
+ case 'contains':
57
+ return this.checkContains(value, output, label);
58
+ case 'not_contains':
59
+ return this.checkNotContains(value, output, label);
60
+ case 'matches':
61
+ return this.checkMatches(value, output, label);
62
+ case 'max_tokens':
63
+ return this.checkMaxTokens(value, output);
64
+ case 'min_tokens':
65
+ return this.checkMinTokens(value, output);
66
+ case 'max_words':
67
+ return this.checkMaxWords(value, output);
68
+ case 'min_words':
69
+ return this.checkMinWords(value, output);
70
+ case 'starts_with':
71
+ return this.checkStartsWith(value, output, label);
72
+ case 'ends_with':
73
+ return this.checkEndsWith(value, output, label);
74
+ default:
75
+ return { pass: false, reason: `Unknown NLP check: ${check}` };
76
+ }
77
+ }
78
+
79
+ private checkContains(
80
+ value: string | string[] | number | undefined,
81
+ output: string,
82
+ label: string
83
+ ): { pass: boolean; reason: string } {
84
+ const values = this.toStringArray(value);
85
+ const lower = output.toLowerCase();
86
+ const missing = values.filter(v => !lower.includes(v.toLowerCase()));
87
+
88
+ if (missing.length === 0) {
89
+ return { pass: true, reason: `${label} contains all expected values` };
90
+ }
91
+ return {
92
+ pass: false,
93
+ reason: `${label} missing: ${missing.map(v => `"${v}"`).join(', ')}`,
94
+ };
95
+ }
96
+
97
+ private checkNotContains(
98
+ value: string | string[] | number | undefined,
99
+ output: string,
100
+ label: string
101
+ ): { pass: boolean; reason: string } {
102
+ const values = this.toStringArray(value);
103
+ const lower = output.toLowerCase();
104
+ const found = values.filter(v => lower.includes(v.toLowerCase()));
105
+
106
+ if (found.length === 0) {
107
+ return { pass: true, reason: `${label} does not contain any excluded values` };
108
+ }
109
+ return {
110
+ pass: false,
111
+ reason: `${label} contains excluded values: ${found.map(v => `"${v}"`).join(', ')}`,
112
+ };
113
+ }
114
+
115
+ private checkMatches(
116
+ value: string | string[] | number | undefined,
117
+ output: string,
118
+ label: string
119
+ ): { pass: boolean; reason: string } {
120
+ if (typeof value !== 'string') {
121
+ return { pass: false, reason: '"matches" check requires a string regex pattern' };
122
+ }
123
+
124
+ const regex = new RegExp(value);
125
+ if (regex.test(output)) {
126
+ return { pass: true, reason: `${label} matches pattern /${value}/` };
127
+ }
128
+ return { pass: false, reason: `${label} does not match pattern /${value}/` };
129
+ }
130
+
131
+ private checkMaxTokens(
132
+ value: string | string[] | number | undefined,
133
+ output: string
134
+ ): { pass: boolean; reason: string } {
135
+ if (typeof value !== 'number') {
136
+ return { pass: false, reason: '"max_tokens" check requires a numeric value' };
137
+ }
138
+
139
+ const tokenCount = this.estimateTokens(output);
140
+ if (tokenCount <= value) {
141
+ return { pass: true, reason: `Token count ${tokenCount} <= ${value}` };
142
+ }
143
+ return { pass: false, reason: `Token count ${tokenCount} exceeds max ${value}` };
144
+ }
145
+
146
+ private checkMinTokens(
147
+ value: string | string[] | number | undefined,
148
+ output: string
149
+ ): { pass: boolean; reason: string } {
150
+ if (typeof value !== 'number') {
151
+ return { pass: false, reason: '"min_tokens" check requires a numeric value' };
152
+ }
153
+
154
+ const tokenCount = this.estimateTokens(output);
155
+ if (tokenCount >= value) {
156
+ return { pass: true, reason: `Token count ${tokenCount} >= ${value}` };
157
+ }
158
+ return { pass: false, reason: `Token count ${tokenCount} below min ${value}` };
159
+ }
160
+
161
+ private checkStartsWith(
162
+ value: string | string[] | number | undefined,
163
+ output: string,
164
+ label: string
165
+ ): { pass: boolean; reason: string } {
166
+ if (typeof value !== 'string') {
167
+ return { pass: false, reason: '"starts_with" check requires a string value' };
168
+ }
169
+
170
+ const trimmed = output.trimStart();
171
+ if (trimmed.toLowerCase().startsWith(value.toLowerCase())) {
172
+ return { pass: true, reason: `${label} starts with "${value}"` };
173
+ }
174
+ return { pass: false, reason: `${label} does not start with "${value}"` };
175
+ }
176
+
177
+ private checkEndsWith(
178
+ value: string | string[] | number | undefined,
179
+ output: string,
180
+ label: string
181
+ ): { pass: boolean; reason: string } {
182
+ if (typeof value !== 'string') {
183
+ return { pass: false, reason: '"ends_with" check requires a string value' };
184
+ }
185
+
186
+ const trimmed = output.trimEnd();
187
+ if (trimmed.toLowerCase().endsWith(value.toLowerCase())) {
188
+ return { pass: true, reason: `${label} ends with "${value}"` };
189
+ }
190
+ return { pass: false, reason: `${label} does not end with "${value}"` };
191
+ }
192
+
193
+ private checkMaxWords(
194
+ value: string | string[] | number | undefined,
195
+ output: string
196
+ ): { pass: boolean; reason: string } {
197
+ if (typeof value !== 'number') {
198
+ return { pass: false, reason: '"max_words" check requires a numeric value' };
199
+ }
200
+
201
+ const wordCount = this.countWords(output);
202
+ if (wordCount <= value) {
203
+ return { pass: true, reason: `Word count ${wordCount} <= ${value}` };
204
+ }
205
+ return { pass: false, reason: `Word count ${wordCount} exceeds max ${value}` };
206
+ }
207
+
208
+ private checkMinWords(
209
+ value: string | string[] | number | undefined,
210
+ output: string
211
+ ): { pass: boolean; reason: string } {
212
+ if (typeof value !== 'number') {
213
+ return { pass: false, reason: '"min_words" check requires a numeric value' };
214
+ }
215
+
216
+ const wordCount = this.countWords(output);
217
+ if (wordCount >= value) {
218
+ return { pass: true, reason: `Word count ${wordCount} >= ${value}` };
219
+ }
220
+ return { pass: false, reason: `Word count ${wordCount} below min ${value}` };
221
+ }
222
+
223
+ private countWords(text: string): number {
224
+ return text.trim().split(/\s+/).filter(w => w.length > 0).length;
225
+ }
226
+
227
+ /**
228
+ * Rough token estimation: ~4 characters per token (GPT-family average).
229
+ * This is intentionally approximate — for precise counting, use a tokenizer.
230
+ */
231
+ private estimateTokens(text: string): number {
232
+ return Math.ceil(text.length / 4);
233
+ }
234
+
235
+ private toStringArray(value: string | string[] | number | undefined): string[] {
236
+ if (value === undefined || value === null) return [];
237
+ if (Array.isArray(value)) return value.map(String);
238
+ return [String(value)];
239
+ }
240
+ }
@@ -0,0 +1,284 @@
1
+ /**
2
+ * Prmd Evaluator - LLM-based evaluation via @prompd/cli.
3
+ *
4
+ * Modes:
5
+ * - prompt: "@scope/pkg@version" -> uses a registry package as the evaluator
6
+ * - prompt: "./path" -> uses a local .prmd file as the evaluator
7
+ * - (no prompt field) -> uses the content block of the .test.prmd
8
+ *
9
+ * The evaluator prompt receives {{input}}, {{output}}, and {{params}} variables.
10
+ * Response must start with PASS or FAIL.
11
+ */
12
+
13
+ import * as path from 'path';
14
+ import * as fs from 'fs';
15
+ import type { Evaluator, EvaluatorContext } from './types';
16
+ import type { AssertionDef, AssertionResult } from '../types';
17
+ import type { CompilerModule } from '../cli-types';
18
+
19
+ const PASS_FAIL_REGEX = /^(PASS|FAIL)[:\s]*(.*)/i;
20
+
21
+ export interface PrmdEvaluatorOptions {
22
+ testFileDir: string;
23
+ evaluatorPrompt?: string;
24
+ workspaceRoot?: string;
25
+ registryUrl?: string;
26
+ cliModule?: CompilerModule;
27
+ provider?: string;
28
+ model?: string;
29
+ }
30
+
31
+ export class PrmdEvaluator implements Evaluator {
32
+ readonly type = 'prmd';
33
+ private options: PrmdEvaluatorOptions;
34
+ private cliModule: CompilerModule | null = null;
35
+
36
+ constructor(options: PrmdEvaluatorOptions) {
37
+ this.options = options;
38
+ if (options.cliModule) {
39
+ this.cliModule = options.cliModule;
40
+ }
41
+ }
42
+
43
+ async evaluate(assertion: AssertionDef, context: EvaluatorContext): Promise<AssertionResult> {
44
+ const start = Date.now();
45
+
46
+ try {
47
+ const evaluatorContent = await this.resolveEvaluatorContent(assertion);
48
+ console.log(`[PrmdEvaluator] Resolved evaluator content (${evaluatorContent?.length || 0} chars)`);
49
+ if (evaluatorContent) {
50
+ console.log(`[PrmdEvaluator] source: ${assertion.prompt || 'content block'}`);
51
+ console.log(`[PrmdEvaluator] preview: ${evaluatorContent.substring(0, 150)}`);
52
+ }
53
+
54
+ if (!evaluatorContent) {
55
+ return {
56
+ evaluator: 'prmd',
57
+ status: 'error',
58
+ reason: 'Could not resolve evaluator prompt content',
59
+ duration: Date.now() - start,
60
+ };
61
+ }
62
+
63
+ // Compile the evaluator prompt with context as parameters
64
+ const cli = await this.getCli();
65
+ const compiled = await this.compileEvaluator(cli, evaluatorContent, context);
66
+
67
+ console.log(`[PrmdEvaluator] Compiled evaluator (${compiled?.length || 0} chars): ${compiled?.substring(0, 150) || 'null'}`);
68
+
69
+ if (!compiled) {
70
+ return {
71
+ evaluator: 'prmd',
72
+ status: 'error',
73
+ reason: 'Evaluator prompt compilation failed',
74
+ duration: Date.now() - start,
75
+ };
76
+ }
77
+
78
+ // Execute against LLM using callLLM directly (avoids executeRawText re-compilation)
79
+ const executor = new cli.PrompdExecutor();
80
+
81
+ // Resolve provider/model/apiKey — same logic as TestRunner
82
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
83
+ const configManager = (cli as any).ConfigManager?.getInstance
84
+ ? (cli as any).ConfigManager.getInstance()
85
+ : null;
86
+ const config = configManager?.config || {};
87
+
88
+ // Priority: assertion-level > run options (UI selector) > config defaults
89
+ const provider = assertion.provider || this.options.provider || config.defaultProvider || 'openai';
90
+ const rawModel = assertion.model || this.options.model || config.default_model || config.defaultModel || '';
91
+ const model = rawModel || this.getDefaultModel(provider);
92
+ const apiKey = configManager?.getApiKey?.(provider, config) || '';
93
+
94
+ console.log(`[PrmdEvaluator] Executing: provider=${provider}, model=${model}`);
95
+
96
+ if (!apiKey && provider !== 'ollama') {
97
+ return {
98
+ evaluator: 'prmd',
99
+ status: 'error',
100
+ reason: `No API key configured for provider "${provider}"`,
101
+ duration: Date.now() - start,
102
+ };
103
+ }
104
+
105
+ const execResult = await executor.callLLM(provider, model, compiled, apiKey);
106
+
107
+ if (!execResult.success) {
108
+ return {
109
+ evaluator: 'prmd',
110
+ status: 'error',
111
+ reason: execResult.error || 'Evaluator LLM execution failed',
112
+ duration: Date.now() - start,
113
+ };
114
+ }
115
+
116
+ const response = execResult.response || execResult.content || '';
117
+ if (!response) {
118
+ return {
119
+ evaluator: 'prmd',
120
+ status: 'error',
121
+ reason: 'No response from evaluator',
122
+ duration: Date.now() - start,
123
+ };
124
+ }
125
+
126
+ // Parse PASS/FAIL from response
127
+ return this.parseEvaluatorResponse(response, Date.now() - start);
128
+ } catch (err) {
129
+ return {
130
+ evaluator: 'prmd',
131
+ status: 'error',
132
+ reason: err instanceof Error ? err.message : String(err),
133
+ duration: Date.now() - start,
134
+ };
135
+ }
136
+ }
137
+
138
+ private async resolveEvaluatorContent(assertion: AssertionDef): Promise<string | null> {
139
+ // If prompt: is specified, resolve it (registry ref, local file)
140
+ if (assertion.prompt) {
141
+ return this.resolvePromptTarget(assertion.prompt);
142
+ }
143
+
144
+ // No prompt: field — use the content block of the .test.prmd
145
+ return this.options.evaluatorPrompt || null;
146
+ }
147
+
148
+ private async resolvePromptTarget(prompt: string): Promise<string | null> {
149
+ // Registry reference: @scope/package@version
150
+ if (prompt.startsWith('@')) {
151
+ return this.wrapAsInherits(prompt);
152
+ }
153
+
154
+ // Local file path
155
+ const resolved = path.resolve(this.options.testFileDir, prompt);
156
+ if (!fs.existsSync(resolved)) {
157
+ throw new Error(`Evaluator prompt file not found: ${resolved}`);
158
+ }
159
+
160
+ return fs.readFileSync(resolved, 'utf-8');
161
+ }
162
+
163
+ /**
164
+ * Wrap a registry reference as a minimal .prmd that inherits from the evaluator package.
165
+ * The compiler handles resolution, download, and caching.
166
+ */
167
+ private wrapAsInherits(registryRef: string): string {
168
+ return [
169
+ '---',
170
+ `inherits: "${registryRef}"`,
171
+ 'parameters:',
172
+ ' - name: prompt',
173
+ ' type: string',
174
+ ' - name: response',
175
+ ' type: string',
176
+ ' - name: params',
177
+ ' type: string',
178
+ '---',
179
+ '',
180
+ ].join('\n');
181
+ }
182
+
183
+ private async compileEvaluator(
184
+ cli: CompilerModule,
185
+ content: string,
186
+ context: EvaluatorContext
187
+ ): Promise<string | null> {
188
+ // If content doesn't start with frontmatter, wrap it with minimal frontmatter
189
+ // so the compiler can process it. Content blocks from .test.prmd are raw markdown.
190
+ let prmdContent = content;
191
+ if (!content.trimStart().startsWith('---')) {
192
+ prmdContent = [
193
+ '---',
194
+ 'id: evaluator',
195
+ 'name: "Test Evaluator"',
196
+ 'version: 0.0.1',
197
+ 'parameters:',
198
+ ' - name: prompt',
199
+ ' type: string',
200
+ ' - name: response',
201
+ ' type: string',
202
+ ' - name: params',
203
+ ' type: object',
204
+ '---',
205
+ '',
206
+ content,
207
+ ].join('\n');
208
+ }
209
+
210
+ const memFs = new cli.MemoryFileSystem({ '/evaluator.prmd': prmdContent });
211
+ const compiler = new cli.PrompdCompiler();
212
+
213
+ // Inject evaluation context as template variables
214
+ const parameters: Record<string, string> = {
215
+ prompt: context.prompt,
216
+ response: context.response,
217
+ params: JSON.stringify(context.params, null, 2),
218
+ };
219
+
220
+ // Also expose individual params via dot notation
221
+ for (const [key, value] of Object.entries(context.params)) {
222
+ parameters[`params.${key}`] = String(value);
223
+ }
224
+
225
+ const result = await compiler.compile('/evaluator.prmd', {
226
+ outputFormat: 'markdown',
227
+ parameters,
228
+ fileSystem: memFs,
229
+ workspaceRoot: this.options.workspaceRoot,
230
+ registryUrl: this.options.registryUrl,
231
+ });
232
+
233
+ // CLI compile() may return a string directly or an object
234
+ if (typeof result === 'string') {
235
+ return result || null;
236
+ }
237
+ return result.output || null;
238
+ }
239
+
240
+ private parseEvaluatorResponse(response: string, duration: number): AssertionResult {
241
+ const firstLine = response.trim().split('\n')[0];
242
+ const match = firstLine.match(PASS_FAIL_REGEX);
243
+
244
+ if (!match) {
245
+ return {
246
+ evaluator: 'prmd',
247
+ status: 'error',
248
+ reason: `Evaluator response did not start with PASS or FAIL. Got: "${firstLine.substring(0, 100)}"`,
249
+ duration,
250
+ };
251
+ }
252
+
253
+ const verdict = match[1].toUpperCase();
254
+ const reason = match[2]?.trim() || undefined;
255
+
256
+ return {
257
+ evaluator: 'prmd',
258
+ status: verdict === 'PASS' ? 'pass' : 'fail',
259
+ reason: reason || `Evaluator returned ${verdict}`,
260
+ duration,
261
+ };
262
+ }
263
+
264
+ private getDefaultModel(provider: string): string {
265
+ const defaults: Record<string, string> = {
266
+ openai: 'gpt-4o',
267
+ anthropic: 'claude-sonnet-4-20250514',
268
+ groq: 'llama-3.1-70b-versatile',
269
+ google: 'gemini-2.0-flash',
270
+ mistral: 'mistral-large-latest',
271
+ deepseek: 'deepseek-chat',
272
+ };
273
+ return defaults[provider.toLowerCase()] || 'gpt-4o';
274
+ }
275
+
276
+ private async getCli(): Promise<CompilerModule> {
277
+ if (!this.cliModule) {
278
+ throw new Error(
279
+ '@prompd/cli module not provided. Pass it via PrmdEvaluatorOptions.cliModule'
280
+ );
281
+ }
282
+ return this.cliModule;
283
+ }
284
+ }
@@ -0,0 +1,152 @@
1
+ /**
2
+ * Script Evaluator - runs external scripts with stdin/stdout contract.
3
+ *
4
+ * Contract:
5
+ * - Receives JSON on stdin: { input, output, params, metadata }
6
+ * - Exit code 0 = PASS, 1 = FAIL, other = ERROR
7
+ * - Stdout = reason (optional)
8
+ */
9
+
10
+ import { spawn } from 'child_process';
11
+ import * as path from 'path';
12
+ import * as fs from 'fs';
13
+ import type { Evaluator, EvaluatorContext } from './types';
14
+ import type { AssertionDef, AssertionResult, EvaluateTarget } from '../types';
15
+
16
+ const SCRIPT_TIMEOUT_MS = 30_000;
17
+
18
+ export class ScriptEvaluator implements Evaluator {
19
+ readonly type = 'script';
20
+ private testFileDir: string;
21
+
22
+ constructor(testFileDir: string) {
23
+ this.testFileDir = testFileDir;
24
+ }
25
+
26
+ async evaluate(assertion: AssertionDef, context: EvaluatorContext): Promise<AssertionResult> {
27
+ const start = Date.now();
28
+ const scriptPath = assertion.run;
29
+
30
+ if (!scriptPath) {
31
+ return {
32
+ evaluator: 'script',
33
+ status: 'error',
34
+ reason: 'No "run" path specified for script evaluator',
35
+ duration: Date.now() - start,
36
+ };
37
+ }
38
+
39
+ const resolvedPath = path.resolve(this.testFileDir, scriptPath);
40
+
41
+ if (!fs.existsSync(resolvedPath)) {
42
+ return {
43
+ evaluator: 'script',
44
+ status: 'error',
45
+ reason: `Script not found: ${resolvedPath}`,
46
+ duration: Date.now() - start,
47
+ };
48
+ }
49
+
50
+ // Validate script stays within the test file's directory tree
51
+ const normalizedScript = path.normalize(resolvedPath);
52
+ const normalizedBase = path.normalize(this.testFileDir);
53
+ if (!normalizedScript.startsWith(normalizedBase)) {
54
+ return {
55
+ evaluator: 'script',
56
+ status: 'error',
57
+ reason: `Script path escapes test directory: ${scriptPath}`,
58
+ duration: Date.now() - start,
59
+ };
60
+ }
61
+
62
+ try {
63
+ const result = await this.runScript(resolvedPath, context, assertion);
64
+ return {
65
+ evaluator: 'script',
66
+ status: result.exitCode === 0 ? 'pass' : 'fail',
67
+ reason: result.stdout.trim() || (result.exitCode === 0 ? 'Script passed' : 'Script failed'),
68
+ duration: Date.now() - start,
69
+ };
70
+ } catch (err) {
71
+ return {
72
+ evaluator: 'script',
73
+ status: 'error',
74
+ reason: err instanceof Error ? err.message : String(err),
75
+ duration: Date.now() - start,
76
+ };
77
+ }
78
+ }
79
+
80
+ private runScript(
81
+ scriptPath: string,
82
+ context: EvaluatorContext,
83
+ assertion: AssertionDef
84
+ ): Promise<{ exitCode: number; stdout: string; stderr: string }> {
85
+ return new Promise((resolve, reject) => {
86
+ const { command, args } = this.getRunner(scriptPath);
87
+ const child = spawn(command, args, {
88
+ cwd: this.testFileDir,
89
+ timeout: SCRIPT_TIMEOUT_MS,
90
+ stdio: ['pipe', 'pipe', 'pipe'],
91
+ shell: process.platform === 'win32',
92
+ });
93
+
94
+ let stdout = '';
95
+ let stderr = '';
96
+
97
+ child.stdout.on('data', (data: Buffer) => {
98
+ stdout += data.toString();
99
+ });
100
+
101
+ child.stderr.on('data', (data: Buffer) => {
102
+ stderr += data.toString();
103
+ });
104
+
105
+ child.on('error', (err) => {
106
+ reject(new Error(`Failed to spawn script: ${err.message}`));
107
+ });
108
+
109
+ child.on('close', (code) => {
110
+ if (code === null) {
111
+ reject(new Error('Script process was killed (timeout or signal)'));
112
+ return;
113
+ }
114
+ resolve({ exitCode: code, stdout, stderr });
115
+ });
116
+
117
+ // Send context as JSON on stdin, include target so script knows what to evaluate
118
+ const target: EvaluateTarget = assertion.evaluate || 'response';
119
+ const payload = JSON.stringify({
120
+ target,
121
+ prompt: context.prompt,
122
+ response: context.response,
123
+ params: context.params,
124
+ metadata: context.metadata,
125
+ });
126
+
127
+ child.stdin.write(payload);
128
+ child.stdin.end();
129
+ });
130
+ }
131
+
132
+ private getRunner(scriptPath: string): { command: string; args: string[] } {
133
+ const ext = path.extname(scriptPath).toLowerCase();
134
+
135
+ switch (ext) {
136
+ case '.ts':
137
+ return { command: 'npx', args: ['tsx', scriptPath] };
138
+ case '.js':
139
+ case '.mjs':
140
+ return { command: 'node', args: [scriptPath] };
141
+ case '.py':
142
+ return { command: 'python', args: [scriptPath] };
143
+ case '.sh':
144
+ return { command: 'bash', args: [scriptPath] };
145
+ case '.ps1':
146
+ return { command: 'powershell', args: ['-File', scriptPath] };
147
+ default:
148
+ // For unknown extensions, try running directly (relies on shebang or OS association)
149
+ return { command: scriptPath, args: [] };
150
+ }
151
+ }
152
+ }
@@ -0,0 +1,24 @@
1
+ /**
2
+ * Evaluator interfaces for @prompd/test
3
+ */
4
+
5
+ import type { AssertionDef, AssertionResult } from '../types';
6
+
7
+ export interface EvaluatorContext {
8
+ prompt: string;
9
+ response: string;
10
+ params: Record<string, unknown>;
11
+ metadata: {
12
+ provider: string;
13
+ model: string;
14
+ duration: number;
15
+ };
16
+ }
17
+
18
+ export interface Evaluator {
19
+ readonly type: string;
20
+ evaluate(
21
+ assertion: AssertionDef,
22
+ context: EvaluatorContext
23
+ ): Promise<AssertionResult>;
24
+ }