stepproof 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +276 -0
  3. package/dist/adapters/anthropic.d.ts +8 -0
  4. package/dist/adapters/anthropic.d.ts.map +1 -0
  5. package/dist/adapters/anthropic.js +26 -0
  6. package/dist/adapters/anthropic.js.map +1 -0
  7. package/dist/adapters/base.d.ts +4 -0
  8. package/dist/adapters/base.d.ts.map +1 -0
  9. package/dist/adapters/base.js +2 -0
  10. package/dist/adapters/base.js.map +1 -0
  11. package/dist/adapters/index.d.ts +4 -0
  12. package/dist/adapters/index.d.ts.map +1 -0
  13. package/dist/adapters/index.js +13 -0
  14. package/dist/adapters/index.js.map +1 -0
  15. package/dist/adapters/openai.d.ts +8 -0
  16. package/dist/adapters/openai.d.ts.map +1 -0
  17. package/dist/adapters/openai.js +25 -0
  18. package/dist/adapters/openai.js.map +1 -0
  19. package/dist/assertions/engine.d.ts +6 -0
  20. package/dist/assertions/engine.d.ts.map +1 -0
  21. package/dist/assertions/engine.js +124 -0
  22. package/dist/assertions/engine.js.map +1 -0
  23. package/dist/cli.d.ts +3 -0
  24. package/dist/cli.d.ts.map +1 -0
  25. package/dist/cli.js +126 -0
  26. package/dist/cli.js.map +1 -0
  27. package/dist/commands/init.d.ts +2 -0
  28. package/dist/commands/init.d.ts.map +1 -0
  29. package/dist/commands/init.js +39 -0
  30. package/dist/commands/init.js.map +1 -0
  31. package/dist/core/scenario-parser.d.ts +4 -0
  32. package/dist/core/scenario-parser.d.ts.map +1 -0
  33. package/dist/core/scenario-parser.js +92 -0
  34. package/dist/core/scenario-parser.js.map +1 -0
  35. package/dist/core/scenario-runner.d.ts +11 -0
  36. package/dist/core/scenario-runner.d.ts.map +1 -0
  37. package/dist/core/scenario-runner.js +85 -0
  38. package/dist/core/scenario-runner.js.map +1 -0
  39. package/dist/core/types.d.ts +71 -0
  40. package/dist/core/types.d.ts.map +1 -0
  41. package/dist/core/types.js +2 -0
  42. package/dist/core/types.js.map +1 -0
  43. package/dist/reporters/json-reporter.d.ts +4 -0
  44. package/dist/reporters/json-reporter.d.ts.map +1 -0
  45. package/dist/reporters/json-reporter.js +9 -0
  46. package/dist/reporters/json-reporter.js.map +1 -0
  47. package/dist/reporters/junit-reporter.d.ts +3 -0
  48. package/dist/reporters/junit-reporter.d.ts.map +1 -0
  49. package/dist/reporters/junit-reporter.js +34 -0
  50. package/dist/reporters/junit-reporter.js.map +1 -0
  51. package/dist/reporters/sarif-reporter.d.ts +3 -0
  52. package/dist/reporters/sarif-reporter.d.ts.map +1 -0
  53. package/dist/reporters/sarif-reporter.js +47 -0
  54. package/dist/reporters/sarif-reporter.js.map +1 -0
  55. package/dist/reporters/terminal-reporter.d.ts +4 -0
  56. package/dist/reporters/terminal-reporter.d.ts.map +1 -0
  57. package/dist/reporters/terminal-reporter.js +73 -0
  58. package/dist/reporters/terminal-reporter.js.map +1 -0
  59. package/package.json +62 -0
  60. package/schemas/scenario.schema.json +119 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 StanislavBG
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,276 @@
1
+ # stepproof
2
+
3
+ [![Part of Preflight](https://img.shields.io/badge/suite-Preflight-blue)](https://github.com/StanislavBG/agent-gate)
4
+ [![Tests](https://img.shields.io/badge/tests-passing-brightgreen)]()
5
+ [![License](https://img.shields.io/badge/license-MIT-green)]()
6
+
7
+ **Regression testing for multi-step AI workflows. Not observability.**
8
+
9
+ ---
10
+
11
+ You upgraded to `gpt-4o-mini`. Your LangSmith traces look fine. Three days later a customer reports your extraction step stopped working. You found out from a Slack message, not a test.
12
+
13
+ stepproof is what you run before you deploy.
14
+
15
+ ```bash
16
+ npm install -g stepproof
17
+ ```
18
+
19
+ ---
20
+
21
+ ## 30-second quickstart
22
+
23
+ Write a scenario:
24
+
25
+ ```yaml
26
+ # classify.yaml
27
+ name: "Intent classification"
28
+ iterations: 10
29
+
30
+ steps:
31
+ - id: classify
32
+ provider: anthropic
33
+ model: claude-sonnet-4-6
34
+ prompt: "Classify the intent of this message: {{input}}"
35
+ variables:
36
+ input: "I need to cancel my subscription"
37
+ min_pass_rate: 0.90
38
+ assertions:
39
+ - type: contains
40
+ value: "cancel"
41
+ - type: json_schema
42
+ schema: ./schemas/intent.json
43
+
44
+ - id: respond
45
+ provider: openai
46
+ model: gpt-4o
47
+ prompt: "Given intent '{{classify.output}}', write a helpful reply to: {{input}}"
48
+ min_pass_rate: 0.80
49
+ assertions:
50
+ - type: llm_judge
51
+ prompt: "Is this response helpful and on-topic? Answer yes/no."
52
+ pass_on: "yes"
53
+ ```
54
+
55
+ Run it:
56
+
57
+ ```
58
+ stepproof run classify.yaml
59
+ ```
60
+
61
+ Output:
62
+
63
+ ```
64
+ stepproof v0.2.0 — running "Intent classification" (10 iterations)
65
+
66
+ step: classify
67
+ ✓ 9/10 passed (90.0%) — threshold: 90% ✓
68
+
69
+ step: respond
70
+ ✓ 8/10 passed (80.0%) — threshold: 80% ✓
71
+
72
+ All steps passed. Exit 0.
73
+ ```
74
+
75
+ Now break it — swap to a cheaper model and watch the pass rate drop below the threshold. It fails:
76
+
77
+ ```
78
+ step: classify
79
+ ✗ 5/10 passed (50.0%) — threshold: 90% ✗
80
+
81
+ 1 step failed. Exit 1.
82
+ ```
83
+
84
+ ---
85
+
86
+ ## Commands
87
+
88
+ ### `stepproof run <scenario>`
89
+
90
+ Run a scenario file or directory of scenarios.
91
+
92
+ ```bash
93
+ stepproof run classify.yaml
94
+ stepproof run scenarios/
95
+ stepproof run scenarios/ --format sarif --output results.sarif
96
+ stepproof run scenarios/ --format junit --output results.xml
97
+ ```
98
+
99
+ Flags:
100
+ - `--format <format>` — output format: `terminal` (default), `sarif`, `junit`
101
+ - `--output <file>` — write output to file instead of stdout
102
+
103
+ ### `stepproof init [dir]`
104
+
105
+ Scaffold a starter scenario in the target directory. Defaults to `./scenarios/`.
106
+
107
+ ```bash
108
+ stepproof init
109
+ # Creates: ./scenarios/first-test.yaml
110
+
111
+ stepproof init my-tests
112
+ # Creates: ./my-tests/first-test.yaml
113
+ ```
114
+
115
+ The generated `first-test.yaml` is a working example you can edit and run immediately.
116
+
117
+ ---
118
+
119
+ ## Environment Variables
120
+
121
+ | Variable | Required | Purpose |
122
+ |----------|----------|---------|
123
+ | `ANTHROPIC_API_KEY` | For Anthropic steps | Authenticates calls to Claude models |
124
+ | `OPENAI_API_KEY` | For OpenAI steps | Authenticates calls to GPT models |
125
+
126
+ Only the keys for the providers you use in your scenarios are required.
127
+
128
+ ---
129
+
130
+ ## CI integration
131
+
132
+ ```yaml
133
+ # .github/workflows/ai-regression.yml
134
+ name: AI regression tests
135
+ on: [push, pull_request]
136
+
137
+ jobs:
138
+ stepproof:
139
+ runs-on: ubuntu-latest
140
+ steps:
141
+ - uses: actions/checkout@v4
142
+ - run: npm install -g stepproof
143
+ - run: stepproof run scenarios/classify.yaml
144
+ env:
145
+ ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
146
+ OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
147
+ ```
148
+
149
+ Exit code 1 on regression. PR blocked. Done.
150
+
151
+ ---
152
+
153
+ ## Assertions
154
+
155
+ | Type | What it checks |
156
+ |------|---------------|
157
+ | `contains` | Output includes this string |
158
+ | `not_contains` | Output does not include this string |
159
+ | `regex` | Output matches this pattern |
160
+ | `json_schema` | Output is valid JSON matching this schema |
161
+ | `llm_judge` | A second LLM call evaluates the output (boolean verdict) |
162
+
163
+ ---
164
+
165
+ ## Structured reports (v0.2.0)
166
+
167
+ stepproof outputs machine-readable SARIF 2.1.0 and JUnit XML for CI pipeline integration.
168
+
169
+ ### SARIF — GitHub Advanced Security / GitLab / Azure DevOps
170
+
171
+ ```bash
172
+ # Write SARIF to stdout
173
+ stepproof run classify.yaml --format sarif
174
+
175
+ # Write SARIF to file
176
+ stepproof run classify.yaml --format sarif --output results.sarif
177
+ ```
178
+
179
+ Integrate with GitHub Advanced Security:
180
+
181
+ ```yaml
182
+ # .github/workflows/ai-regression.yml
183
+ - name: Run stepproof
184
+ run: stepproof run scenarios/ --format sarif --output results.sarif
185
+
186
+ - name: Upload to GitHub Security tab
187
+ uses: github/codeql-action/upload-sarif@v3
188
+ with:
189
+ sarif_file: results.sarif
190
+ if: always()
191
+ ```
192
+
193
+ ### JUnit XML — Jenkins / CircleCI / TeamCity
194
+
195
+ ```bash
196
+ stepproof run classify.yaml --format junit
197
+ stepproof run classify.yaml --format junit --output results.xml
198
+ ```
199
+
200
+ ```yaml
201
+ # .github/workflows/ai-regression.yml
202
+ - name: Run stepproof
203
+ run: stepproof run scenarios/ --format junit --output test-results.xml
204
+
205
+ - name: Publish test results
206
+ uses: actions/upload-artifact@v4
207
+ with:
208
+ name: test-results
209
+ path: test-results.xml
210
+ if: always()
211
+ ```
212
+
213
+ Default output (no `--format` flag) is unchanged — human-readable terminal output.
214
+
215
+ > **Migration note (v0.2.x → v0.3.0):** `--report` still works but is deprecated and will print a warning. Switch to `--format` at your next convenience. `--report` will be removed at v1.0.0.
216
+
217
+ ---
218
+
219
+ ## How this is different from LangSmith / Braintrust / Langfuse
220
+
221
+ | | stepproof | LangSmith / Braintrust / Langfuse |
222
+ |--|-----------|------------------------|
223
+ | When it runs | Before deploy (CI) | After deploy (production) |
224
+ | What it answers | "Is my pipeline still correct?" | "What did my pipeline do?" |
225
+ | Output | Pass/fail with exit code | Traces and dashboards |
226
+ | Use case | Regression testing | Observability |
227
+
228
+ They tell you what happened. We tell you whether to deploy.
229
+
230
+ These are different jobs. Use both.
231
+
232
+ ---
233
+
234
+ ## Scenarios
235
+
236
+ See [`/examples`](./examples) for copy-paste ready scenarios:
237
+ - [`simple-chain.yaml`](./examples/simple-chain.yaml) — basic prompt → response → assertion
238
+ - [`tool-calling.yaml`](./examples/tool-calling.yaml) — verify tool selection and output
239
+ - [`multi-turn.yaml`](./examples/multi-turn.yaml) — conversation with memory, verify consistency
240
+
241
+ ---
242
+
243
+ ## Roadmap
244
+
245
+ - **v0.2.0** (current): YAML scenarios, N iterations, 5 assertion types, exit code 1 on failure, OpenAI + Anthropic, SARIF 2.1.0 + JUnit XML reporters, `stepproof init` scaffolding
246
+ - **v0.3.0** (next): Baseline comparison (fail on regression from last run), GitHub Actions native action, provider comparison mode — run the same scenario against two models and diff the results
247
+ - **Cloud dashboard** (month 3–6): Persistent history, trend charts, team workspaces — never in the CLI
248
+
249
+ ---
250
+
251
+ ## Contributing
252
+
253
+ Issues and PRs welcome. See [CONTRIBUTING.md](./CONTRIBUTING.md) for dev setup and guidelines. The tool is and will remain free. Cloud features are the business model, not the CLI.
254
+
255
+ ---
256
+
257
+ ## Part of the Preflight suite
258
+
259
+ stepproof is one tool in the **Preflight** AI Agent DevOps suite — local-first CLIs covering the full lifecycle from pre-deploy validation to production observability:
260
+
261
+ | Tool | Purpose | Install |
262
+ |------|---------|---------|
263
+ | **stepproof** | Behavioral regression testing | `npm install -g stepproof` |
264
+ | **agent-comply** | EU AI Act compliance scanning | `npm install -g agent-comply` |
265
+ | **agent-gate** | Unified pre-deploy CI gate | `npm install -g agent-gate` |
266
+ | **agent-shift** | Config versioning + environment promotion | `npm install -g agent-shift` |
267
+ | **agent-trace** | Local observability — OTel traces in SQLite | `npm install -g agent-trace` |
268
+
269
+ Install the full suite:
270
+ ```bash
271
+ npm install -g agent-gate stepproof agent-comply agent-shift agent-trace
272
+ ```
273
+
274
+ ---
275
+
276
+ *stepproof — because "I checked manually before the deploy" is not a test.*
@@ -0,0 +1,8 @@
1
+ import type { ProviderAdapter } from './base.js';
2
+ export declare class AnthropicAdapter implements ProviderAdapter {
3
+ private client;
4
+ private model;
5
+ constructor(model: string);
6
+ call(prompt: string, system?: string): Promise<string>;
7
+ }
8
+ //# sourceMappingURL=anthropic.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"anthropic.d.ts","sourceRoot":"","sources":["../../src/adapters/anthropic.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAEjD,qBAAa,gBAAiB,YAAW,eAAe;IACtD,OAAO,CAAC,MAAM,CAAY;IAC1B,OAAO,CAAC,KAAK,CAAS;gBAEV,KAAK,EAAE,MAAM;IAQnB,IAAI,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;CAe7D"}
@@ -0,0 +1,26 @@
1
+ import Anthropic from '@anthropic-ai/sdk';
2
/**
 * Provider adapter that sends a single-turn prompt to the Anthropic Messages API.
 */
export class AnthropicAdapter {
    client;
    model;

    /**
     * @param {string} model - Anthropic model identifier passed through to the API.
     * @throws {Error} When the ANTHROPIC_API_KEY environment variable is not set.
     */
    constructor(model) {
        this.model = model;
        const apiKey = process.env.ANTHROPIC_API_KEY;
        if (!apiKey) {
            throw new Error('ANTHROPIC_API_KEY environment variable is required for Anthropic provider');
        }
        this.client = new Anthropic({ apiKey });
    }

    /**
     * Sends `prompt` as one user message and returns the model's text reply.
     *
     * @param {string} prompt - User message content.
     * @param {string} [system] - Optional system prompt; omitted from the request when falsy.
     * @returns {Promise<string>} All text blocks of the response concatenated in order,
     *   or '' when the response contains no text block.
     */
    async call(prompt, system) {
        const response = await this.client.messages.create({
            model: this.model,
            max_tokens: 1024,
            ...(system && { system }),
            messages: [{ role: 'user', content: prompt }],
        });
        // The reply is a list of content blocks and the text is not guaranteed to be
        // the first one, so collect every text block instead of inspecting only
        // content[0] (which silently produced '' when a non-text block came first).
        return (response.content ?? [])
            .filter((block) => block?.type === 'text')
            .map((block) => block.text)
            .join('');
    }
}
26
+ //# sourceMappingURL=anthropic.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"anthropic.js","sourceRoot":"","sources":["../../src/adapters/anthropic.ts"],"names":[],"mappings":"AAAA,OAAO,SAAS,MAAM,mBAAmB,CAAC;AAG1C,MAAM,OAAO,gBAAgB;IACnB,MAAM,CAAY;IAClB,KAAK,CAAS;IAEtB,YAAY,KAAa;QACvB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC;YACnC,MAAM,IAAI,KAAK,CAAC,2EAA2E,CAAC,CAAC;QAC/F,CAAC;QACD,IAAI,CAAC,MAAM,GAAG,IAAI,SAAS,CAAC,EAAE,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC,CAAC;IACzE,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,MAAc,EAAE,MAAe;QACxC,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC;YACjD,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,UAAU,EAAE,IAAI;YAChB,GAAG,CAAC,MAAM,IAAI,EAAE,MAAM,EAAE,CAAC;YACzB,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC;SAC9C,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QACpC,IAAI,OAAO,EAAE,IAAI,KAAK,MAAM,EAAE,CAAC;YAC7B,OAAO,OAAO,CAAC,IAAI,CAAC;QACtB,CAAC;QAED,OAAO,EAAE,CAAC;IACZ,CAAC;CACF"}
@@ -0,0 +1,4 @@
1
+ export interface ProviderAdapter {
2
+ call(prompt: string, system?: string): Promise<string>;
3
+ }
4
+ //# sourceMappingURL=base.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"base.d.ts","sourceRoot":"","sources":["../../src/adapters/base.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,eAAe;IAC9B,IAAI,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;CACxD"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=base.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"base.js","sourceRoot":"","sources":["../../src/adapters/base.ts"],"names":[],"mappings":""}
@@ -0,0 +1,4 @@
1
+ import type { ProviderAdapter } from './base.js';
2
+ export declare function getAdapter(provider: string, model: string): ProviderAdapter;
3
+ export type { ProviderAdapter };
4
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/adapters/index.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAEjD,wBAAgB,UAAU,CAAC,QAAQ,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,GAAG,eAAe,CAS3E;AAED,YAAY,EAAE,eAAe,EAAE,CAAC"}
@@ -0,0 +1,13 @@
1
+ import { OpenAIAdapter } from './openai.js';
2
+ import { AnthropicAdapter } from './anthropic.js';
3
/**
 * Builds the adapter for a named LLM provider.
 *
 * @param {string} provider - Provider key; 'openai' and 'anthropic' are recognised.
 * @param {string} model - Model identifier handed to the adapter constructor.
 * @returns {ProviderAdapter} A freshly constructed adapter for that provider.
 * @throws {Error} When `provider` is not one of the recognised keys.
 */
export function getAdapter(provider, model) {
    if (provider === 'openai') {
        return new OpenAIAdapter(model);
    }
    if (provider === 'anthropic') {
        return new AnthropicAdapter(model);
    }
    throw new Error(`Unknown provider: "${provider}". Supported providers: openai, anthropic`);
}
13
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/adapters/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAC5C,OAAO,EAAE,gBAAgB,EAAE,MAAM,gBAAgB,CAAC;AAGlD,MAAM,UAAU,UAAU,CAAC,QAAgB,EAAE,KAAa;IACxD,QAAQ,QAAQ,EAAE,CAAC;QACjB,KAAK,QAAQ;YACX,OAAO,IAAI,aAAa,CAAC,KAAK,CAAC,CAAC;QAClC,KAAK,WAAW;YACd,OAAO,IAAI,gBAAgB,CAAC,KAAK,CAAC,CAAC;QACrC;YACE,MAAM,IAAI,KAAK,CAAC,sBAAsB,QAAQ,2CAA2C,CAAC,CAAC;IAC/F,CAAC;AACH,CAAC"}
@@ -0,0 +1,8 @@
1
+ import type { ProviderAdapter } from './base.js';
2
+ export declare class OpenAIAdapter implements ProviderAdapter {
3
+ private client;
4
+ private model;
5
+ constructor(model: string);
6
+ call(prompt: string, system?: string): Promise<string>;
7
+ }
8
+ //# sourceMappingURL=openai.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"openai.d.ts","sourceRoot":"","sources":["../../src/adapters/openai.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAEjD,qBAAa,aAAc,YAAW,eAAe;IACnD,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,KAAK,CAAS;gBAEV,KAAK,EAAE,MAAM;IAQnB,IAAI,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;CAe7D"}
@@ -0,0 +1,25 @@
1
+ import OpenAI from 'openai';
2
/**
 * Provider adapter that sends a single-turn prompt to the OpenAI chat completions API.
 */
export class OpenAIAdapter {
    client;
    model;

    /**
     * @param {string} model - OpenAI model identifier passed through to the API.
     * @throws {Error} When the OPENAI_API_KEY environment variable is not set.
     */
    constructor(model) {
        this.model = model;
        const apiKey = process.env.OPENAI_API_KEY;
        if (!apiKey) {
            throw new Error('OPENAI_API_KEY environment variable is required for OpenAI provider');
        }
        this.client = new OpenAI({ apiKey });
    }

    /**
     * Sends `prompt` as the user turn, preceded by a system turn when one is given.
     *
     * @param {string} prompt - User message content.
     * @param {string} [system] - Optional system prompt; skipped when falsy.
     * @returns {Promise<string>} Content of the first choice, or '' when it is absent.
     */
    async call(prompt, system) {
        const userTurn = { role: 'user', content: prompt };
        const messages = system
            ? [{ role: 'system', content: system }, userTurn]
            : [userTurn];
        const completion = await this.client.chat.completions.create({
            model: this.model,
            messages,
        });
        const firstChoice = completion.choices[0];
        return firstChoice?.message?.content ?? '';
    }
}
25
+ //# sourceMappingURL=openai.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"openai.js","sourceRoot":"","sources":["../../src/adapters/openai.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,QAAQ,CAAC;AAG5B,MAAM,OAAO,aAAa;IAChB,MAAM,CAAS;IACf,KAAK,CAAS;IAEtB,YAAY,KAAa;QACvB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,EAAE,CAAC;YAChC,MAAM,IAAI,KAAK,CAAC,qEAAqE,CAAC,CAAC;QACzF,CAAC;QACD,IAAI,CAAC,MAAM,GAAG,IAAI,MAAM,CAAC,EAAE,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,cAAc,EAAE,CAAC,CAAC;IACnE,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,MAAc,EAAE,MAAe;QACxC,MAAM,QAAQ,GAA6C,EAAE,CAAC;QAE9D,IAAI,MAAM,EAAE,CAAC;YACX,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,CAAC;QACrD,CAAC;QACD,QAAQ,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC,CAAC;QAEjD,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,CAAC;YACzD,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,QAAQ;SACT,CAAC,CAAC;QAEH,OAAO,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,OAAO,EAAE,OAAO,IAAI,EAAE,CAAC;IACrD,CAAC;CACF"}
@@ -0,0 +1,6 @@
1
+ import type { Assertion, AssertionResult } from '../core/types.js';
2
+ export declare function runAssertions(output: string, assertions: Assertion[], scenarioDir: string): Promise<{
3
+ results: AssertionResult[];
4
+ allPassed: boolean;
5
+ }>;
6
+ //# sourceMappingURL=engine.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"engine.d.ts","sourceRoot":"","sources":["../../src/assertions/engine.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AAInE,wBAAsB,aAAa,CACjC,MAAM,EAAE,MAAM,EACd,UAAU,EAAE,SAAS,EAAE,EACvB,WAAW,EAAE,MAAM,GAClB,OAAO,CAAC;IAAE,OAAO,EAAE,eAAe,EAAE,CAAC;IAAC,SAAS,EAAE,OAAO,CAAA;CAAE,CAAC,CAU7D"}
@@ -0,0 +1,124 @@
1
+ import * as fs from 'node:fs';
2
+ import * as path from 'node:path';
3
+ import { Ajv as AjvClass } from 'ajv';
4
+ import { getAdapter } from '../adapters/index.js';
5
+ const ajv = new AjvClass({ allErrors: true });
6
/**
 * Evaluates every assertion against one step output, one after another.
 *
 * @param {string} output - Text produced by the step under test.
 * @param {Array<object>} assertions - Assertion definitions, evaluated in order.
 * @param {string} scenarioDir - Directory handed to each assertion for resolving relative paths.
 * @returns {Promise<{results: Array<object>, allPassed: boolean}>} One result per
 *   assertion (same order) and whether all of them passed; an empty list passes.
 */
export async function runAssertions(output, assertions, scenarioDir) {
    const results = [];
    let allPassed = true;
    for (const assertion of assertions) {
        // Awaited inside the loop on purpose: assertions run sequentially.
        const outcome = await runAssertion(output, assertion, scenarioDir);
        results.push(outcome);
        if (!outcome.passed) {
            allPassed = false;
        }
    }
    return { results, allPassed };
}
15
/**
 * Compiled schema validators keyed by absolute schema path.
 *
 * The same assertion is evaluated once per iteration. Compiling the schema
 * again for every output repeats the file read and the compilation, and Ajv
 * refuses to compile a second schema object that declares an `$id` it has
 * already registered, so every run after the first would fail.
 */
const validatorCache = new Map();

/**
 * Normalises an assertion's expected-text field.
 *
 * @param {unknown} raw - Field value as parsed from the scenario file.
 * @returns {string|undefined} The value as a string, or undefined when the
 *   field is absent or empty. Non-string scalars (e.g. a YAML number) are
 *   stringified instead of crashing on `.toLowerCase()`.
 */
function toExpectedText(raw) {
    if (raw === undefined || raw === null || raw === '') {
        return undefined;
    }
    return String(raw);
}

/**
 * Evaluates a single assertion against a step output.
 *
 * Configuration problems (missing fields, unreadable schema, unknown type,
 * judge call failure) are reported as failed results rather than thrown.
 *
 * @param {string} output - Text produced by the step under test.
 * @param {object} assertion - Assertion definition; `type` selects the check.
 * @param {string} scenarioDir - Base directory for resolving `schema` paths.
 * @returns {Promise<{type: string, passed: boolean, message?: string}>}
 */
async function runAssertion(output, assertion, scenarioDir) {
    const { type } = assertion;
    switch (type) {
        case 'contains': {
            const expected = toExpectedText(assertion.value);
            if (expected === undefined) {
                return fail(type, 'Missing required field "value"');
            }
            // Case-insensitive substring match.
            const passed = output.toLowerCase().includes(expected.toLowerCase());
            return passed
                ? pass(type)
                : fail(type, `Expected output to contain: "${expected}"`);
        }
        case 'not_contains': {
            const forbidden = toExpectedText(assertion.value);
            if (forbidden === undefined) {
                return fail(type, 'Missing required field "value"');
            }
            const passed = !output.toLowerCase().includes(forbidden.toLowerCase());
            return passed
                ? pass(type)
                : fail(type, `Expected output NOT to contain: "${forbidden}"`);
        }
        case 'regex': {
            const pattern = toExpectedText(assertion.value);
            if (pattern === undefined) {
                return fail(type, 'Missing required field "value" (regex pattern)');
            }
            let regex;
            try {
                regex = new RegExp(pattern, 'i');
            }
            catch {
                return fail(type, `Invalid regex pattern: "${pattern}"`);
            }
            return regex.test(output)
                ? pass(type)
                : fail(type, `Output did not match pattern: ${pattern}`);
        }
        case 'json_schema': {
            if (!assertion.schema) {
                return fail(type, 'Missing required field "schema" (path to JSON schema file)');
            }
            let parsed;
            try {
                parsed = JSON.parse(output);
            }
            catch {
                return fail(type, `Output is not valid JSON`);
            }
            const schemaPath = path.resolve(scenarioDir, assertion.schema);
            let validate = validatorCache.get(schemaPath);
            if (validate === undefined) {
                let schema;
                try {
                    schema = JSON.parse(fs.readFileSync(schemaPath, 'utf-8'));
                }
                catch {
                    return fail(type, `Cannot read schema file: ${assertion.schema}`);
                }
                try {
                    validate = ajv.compile(schema);
                }
                catch (e) {
                    return fail(type, `Invalid JSON schema: ${e.message}`);
                }
                // Only successful compilations are cached, so a broken schema is
                // re-reported on every evaluation.
                validatorCache.set(schemaPath, validate);
            }
            if (validate(parsed)) {
                return pass(type);
            }
            const errors = ajv.errorsText(validate.errors, { separator: '; ' });
            return fail(type, `Schema validation failed: ${errors}`);
        }
        case 'llm_judge': {
            if (!assertion.prompt) {
                return fail(type, 'Missing required field "prompt" for llm_judge assertion');
            }
            // String() guards against non-string scalars such as a YAML boolean.
            const passOn = String(assertion.pass_on ?? 'yes').toLowerCase().trim();
            if (passOn === '') {
                // startsWith('') is always true, which would make the judge pass everything.
                return fail(type, 'Field "pass_on" must not be empty for llm_judge assertion');
            }
            const provider = assertion.provider ?? 'anthropic';
            const model = assertion.model ?? (provider === 'anthropic' ? 'claude-haiku-4-5-20251001' : 'gpt-4o-mini');
            let adapter;
            try {
                adapter = getAdapter(provider, model);
            }
            catch (e) {
                return fail(type, `Cannot create LLM judge adapter: ${e.message}`);
            }
            const judgePrompt = `${assertion.prompt}\n\nText to evaluate:\n---\n${output}\n---\n\nAnswer with a single word.`;
            let judgeResponse;
            try {
                judgeResponse = await adapter.call(judgePrompt);
            }
            catch (e) {
                return fail(type, `LLM judge API call failed: ${e.message}`);
            }
            // Prefix match, case-insensitive: "Yes." and "yes, it is" both satisfy pass_on "yes".
            const normalizedResponse = judgeResponse.trim().toLowerCase();
            const passed = normalizedResponse.startsWith(passOn);
            return passed
                ? pass(type)
                : fail(type, `LLM judge responded "${judgeResponse.trim()}" (expected to start with: "${passOn}")`);
        }
        default: {
            return fail(type, `Unknown assertion type: "${type}". Valid types: contains, not_contains, regex, json_schema, llm_judge`);
        }
    }
}

/**
 * Builds a passing assertion result.
 *
 * @param {string} type - Assertion type the result belongs to.
 * @returns {{type: string, passed: true}}
 */
function pass(type) {
    return { type, passed: true };
}

/**
 * Builds a failing assertion result.
 *
 * @param {string} type - Assertion type the result belongs to.
 * @param {string} message - Human-readable reason for the failure.
 * @returns {{type: string, passed: false, message: string}}
 */
function fail(type, message) {
    return { type, passed: false, message };
}
124
+ //# sourceMappingURL=engine.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"engine.js","sourceRoot":"","sources":["../../src/assertions/engine.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,SAAS,CAAC;AAC9B,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAClC,OAAO,EAAE,GAAG,IAAI,QAAQ,EAAE,MAAM,KAAK,CAAC;AAEtC,OAAO,EAAE,UAAU,EAAE,MAAM,sBAAsB,CAAC;AAGlD,MAAM,GAAG,GAAG,IAAI,QAAQ,CAAC,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;AAE9C,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,MAAc,EACd,UAAuB,EACvB,WAAmB;IAEnB,MAAM,OAAO,GAAsB,EAAE,CAAC;IAEtC,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;QACnC,MAAM,MAAM,GAAG,MAAM,YAAY,CAAC,MAAM,EAAE,SAAS,EAAE,WAAW,CAAC,CAAC;QAClE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACvB,CAAC;IAED,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IACjD,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,CAAC;AAChC,CAAC;AAED,KAAK,UAAU,YAAY,CACzB,MAAc,EACd,SAAoB,EACpB,WAAmB;IAEnB,MAAM,EAAE,IAAI,EAAE,GAAG,SAAS,CAAC;IAE3B,QAAQ,IAAI,EAAE,CAAC;QACb,KAAK,UAAU,CAAC,CAAC,CAAC;YAChB,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,CAAC;gBACrB,OAAO,IAAI,CAAC,IAAI,EAAE,gCAAgC,CAAC,CAAC;YACtD,CAAC;YACD,MAAM,MAAM,GAAG,MAAM,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,SAAS,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC,CAAC;YAC5E,OAAO,MAAM;gBACX,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;gBACZ,CAAC,CAAC,IAAI,CAAC,IAAI,EAAE,gCAAgC,SAAS,CAAC,KAAK,GAAG,CAAC,CAAC;QACrE,CAAC;QAED,KAAK,cAAc,CAAC,CAAC,CAAC;YACpB,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,CAAC;gBACrB,OAAO,IAAI,CAAC,IAAI,EAAE,gCAAgC,CAAC,CAAC;YACtD,CAAC;YACD,MAAM,MAAM,GAAG,CAAC,MAAM,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,SAAS,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC,CAAC;YAC7E,OAAO,MAAM;gBACX,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;gBACZ,CAAC,CAAC,IAAI,CAAC,IAAI,EAAE,oCAAoC,SAAS,CAAC,KAAK,GAAG,CAAC,CAAC;QACzE,CAAC;QAED,KAAK,OAAO,CAAC,CAAC,CAAC;YACb,IAAI,CAAC,SAAS,CAAC,KAAK,EAAE,CAAC;gBACrB,OAAO,IAAI,CAAC,IAAI,EAAE,gDAAgD,CAAC,CAAC;YACtE,CAAC;YACD,IAAI,KAAa,CAAC;YAClB,IAAI,CAAC;gBACH,KAAK,GAAG,IAAI,MAAM,CAAC,SAAS,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;YAC3C,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACX,OAAO,IAAI,CAAC,IAAI,EAAE,2BAA2B,SAAS,CAAC,KAAK,GAAG,CAAC,CAAC;YACnE,CAAC;YACD,MAAM,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,C
AAC,CAAC;YAClC,OAAO,MAAM;gBACX,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;gBACZ,CAAC,CAAC,IAAI,CAAC,IAAI,EAAE,iCAAiC,SAAS,CAAC,KAAK,EAAE,CAAC,CAAC;QACrE,CAAC;QAED,KAAK,aAAa,CAAC,CAAC,CAAC;YACnB,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,CAAC;gBACtB,OAAO,IAAI,CAAC,IAAI,EAAE,4DAA4D,CAAC,CAAC;YAClF,CAAC;YAED,IAAI,MAAe,CAAC;YACpB,IAAI,CAAC;gBACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;YAC9B,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,IAAI,CAAC,IAAI,EAAE,0BAA0B,CAAC,CAAC;YAChD,CAAC;YAED,MAAM,UAAU,GAAG,IAAI,CAAC,OAAO,CAAC,WAAW,EAAE,SAAS,CAAC,MAAM,CAAC,CAAC;YAC/D,IAAI,MAAc,CAAC;YACnB,IAAI,CAAC;gBACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC,CAAC;YAC5D,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACX,OAAO,IAAI,CAAC,IAAI,EAAE,4BAA4B,SAAS,CAAC,MAAM,EAAE,CAAC,CAAC;YACpE,CAAC;YAED,IAAI,QAA0B,CAAC;YAC/B,IAAI,CAAC;gBACH,QAAQ,GAAG,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;YACjC,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACX,OAAO,IAAI,CAAC,IAAI,EAAE,wBAAyB,CAAW,CAAC,OAAO,EAAE,CAAC,CAAC;YACpE,CAAC;YAED,MAAM,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC;YAC/B,IAAI,KAAK,EAAE,CAAC;gBACV,OAAO,IAAI,CAAC,IAAI,CAAC,CAAC;YACpB,CAAC;YACD,MAAM,MAAM,GAAG,GAAG,CAAC,UAAU,CAAC,QAAQ,CAAC,MAAM,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;YACpE,OAAO,IAAI,CAAC,IAAI,EAAE,6BAA6B,MAAM,EAAE,CAAC,CAAC;QAC3D,CAAC;QAED,KAAK,WAAW,CAAC,CAAC,CAAC;YACjB,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,CAAC;gBACtB,OAAO,IAAI,CAAC,IAAI,EAAE,yDAAyD,CAAC,CAAC;YAC/E,CAAC;YAED,MAAM,MAAM,GAAG,CAAC,SAAS,CAAC,OAAO,IAAI,KAAK,CAAC,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,CAAC;YACjE,MAAM,QAAQ,GAAG,SAAS,CAAC,QAAQ,IAAI,WAAW,CAAC;YACnD,MAAM,KAAK,GAAG,SAAS,CAAC,KAAK,IAAI,CAAC,QAAQ,KAAK,WAAW,CAAC,CAAC,CAAC,2BAA2B,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC;YAE1G,IAAI,OAAO,CAAC;YACZ,IAAI,CAAC;gBACH,OAAO,GAAG,UAAU,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;YACxC,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACX,OAAO,IAAI,CAAC,IAAI,EAAE,oCAAqC,CAAW,CAAC,OAAO,EAAE,CAAC,CAAC;YAChF,CAAC;YAED,MAAM,WAAW,GAAG,GAAG,SAAS,CAAC,MAAM,+BAA+B,MAAM,qCAAqC,CAAC;YAElH,IAAI,aAAqB,CAAC;YAC1B,IAAI,CAAC;gBACH,aAAa,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;YAClD,CAA
C;YAAC,OAAO,CAAC,EAAE,CAAC;gBACX,OAAO,IAAI,CAAC,IAAI,EAAE,8BAA+B,CAAW,CAAC,OAAO,EAAE,CAAC,CAAC;YAC1E,CAAC;YAED,MAAM,kBAAkB,GAAG,aAAa,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;YAC9D,MAAM,MAAM,GAAG,kBAAkB,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC;YAErD,OAAO,MAAM;gBACX,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC;gBACZ,CAAC,CAAC,IAAI,CAAC,IAAI,EAAE,wBAAwB,aAAa,CAAC,IAAI,EAAE,+BAA+B,MAAM,IAAI,CAAC,CAAC;QACxG,CAAC;QAED,OAAO,CAAC,CAAC,CAAC;YACR,OAAO,IAAI,CAAC,IAAc,EAAE,4BAA4B,IAAI,uEAAuE,CAAC,CAAC;QACvI,CAAC;IACH,CAAC;AACH,CAAC;AAED,SAAS,IAAI,CAAC,IAAY;IACxB,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,CAAC;AAChC,CAAC;AAED,SAAS,IAAI,CAAC,IAAY,EAAE,OAAe;IACzC,OAAO,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,OAAO,EAAE,CAAC;AAC1C,CAAC"}
package/dist/cli.d.ts ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env node
2
+ export {};
3
+ //# sourceMappingURL=cli.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":""}