prompt-lock 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/CHANGELOG.md +36 -0
  2. package/README.md +161 -4
  3. package/dist/assertions/builtin.d.ts.map +1 -1
  4. package/dist/assertions/builtin.js +11 -0
  5. package/dist/assertions/builtin.js.map +1 -1
  6. package/dist/assertions/index.d.ts.map +1 -1
  7. package/dist/assertions/index.js +5 -0
  8. package/dist/assertions/index.js.map +1 -1
  9. package/dist/assertions/llm-judge.d.ts +6 -0
  10. package/dist/assertions/llm-judge.d.ts.map +1 -0
  11. package/dist/assertions/llm-judge.js +75 -0
  12. package/dist/assertions/llm-judge.js.map +1 -0
  13. package/dist/cli.js +114 -11
  14. package/dist/cli.js.map +1 -1
  15. package/dist/config-loader.d.ts +4 -0
  16. package/dist/config-loader.d.ts.map +1 -0
  17. package/dist/config-loader.js +85 -0
  18. package/dist/config-loader.js.map +1 -0
  19. package/dist/config-validation.d.ts.map +1 -1
  20. package/dist/config-validation.js +33 -3
  21. package/dist/config-validation.js.map +1 -1
  22. package/dist/dataset-loader.d.ts +2 -0
  23. package/dist/dataset-loader.d.ts.map +1 -0
  24. package/dist/dataset-loader.js +130 -0
  25. package/dist/dataset-loader.js.map +1 -0
  26. package/dist/index.d.ts +6 -3
  27. package/dist/index.d.ts.map +1 -1
  28. package/dist/index.js +18 -1
  29. package/dist/index.js.map +1 -1
  30. package/dist/pricing.d.ts +9 -0
  31. package/dist/pricing.d.ts.map +1 -0
  32. package/dist/pricing.js +49 -0
  33. package/dist/pricing.js.map +1 -0
  34. package/dist/providers/anthropic.d.ts.map +1 -1
  35. package/dist/providers/anthropic.js +49 -36
  36. package/dist/providers/anthropic.js.map +1 -1
  37. package/dist/providers/index.d.ts.map +1 -1
  38. package/dist/providers/index.js +7 -1
  39. package/dist/providers/index.js.map +1 -1
  40. package/dist/providers/openai.d.ts.map +1 -1
  41. package/dist/providers/openai.js +48 -35
  42. package/dist/providers/openai.js.map +1 -1
  43. package/dist/reporter.d.ts +4 -1
  44. package/dist/reporter.d.ts.map +1 -1
  45. package/dist/reporter.js +141 -0
  46. package/dist/reporter.js.map +1 -1
  47. package/dist/runner.d.ts +2 -1
  48. package/dist/runner.d.ts.map +1 -1
  49. package/dist/runner.js +136 -20
  50. package/dist/runner.js.map +1 -1
  51. package/dist/types.d.ts +41 -2
  52. package/dist/types.d.ts.map +1 -1
  53. package/package.json +5 -2
  54. package/schemas/promptlock.schema.json +271 -0
package/CHANGELOG.md CHANGED
@@ -5,6 +5,42 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.4.0] - 2026-04-09
9
+
10
+ ### Added
11
+
12
+ - **A/B testing mode** — compare two prompt variants side-by-side with `prompt-lock run --ab v1:v2`. Shows cost, latency, token, and pass-rate deltas; picks a winner based on pass rate → cost (5% threshold) → latency (10% threshold).
13
+ - **`runAB()` programmatic API** — call `runAB(variantA, variantB, opts)` to get an `ABComparisonResult` with winner and deltas.
14
+ - **A/B markdown reports** — `generateABMarkdownReport()` writes comparison tables to `.promptlock/reports/ab-<timestamp>.md`, ideal for PR comments.
15
+ - **JSON Schema for configs** — `schemas/promptlock.schema.json` ships with the package. YAML/JSON users get IDE autocomplete and validation by adding:
16
+ ```yaml
17
+ # yaml-language-server: $schema=https://raw.githubusercontent.com/shmulikdav/Promptlock/main/schemas/promptlock.schema.json
18
+ ```
19
+
20
+ ### Changed
21
+
22
+ - `files` field in package.json now includes `schemas/` directory so the schema ships with npm.
23
+
24
+ ## [0.3.0] - 2026-04-05
25
+
26
+ ### Added
27
+
28
+ - **LLM-as-judge assertion** (`llm-judge`) — use a separate LLM to evaluate output quality with configurable criteria and threshold
29
+ - **YAML/JSON config files** — auto-discovers `promptlock.yaml`/`.yml`/`.json` in project root; no more JS-only configs
30
+ - **CSV/JSON dataset import** — `dataset: './data/test-inputs.csv'` loads external test data files
31
+ - **Cost & token tracking** — captures token usage from OpenAI and Anthropic responses, estimates cost via built-in pricing table
32
+ - **`max-cost` assertion** — fail if a prompt exceeds a dollar threshold (e.g. `{ type: 'max-cost', dollars: 0.05 }`)
33
+ - **Markdown report format** — `--report markdown` generates GitHub-flavored markdown tables
34
+ - **Watch mode** (`--watch`) — auto-reruns on config/prompt file changes with debounce
35
+ - Cost summary in console, JSON, and HTML reports
36
+ - `estimateCost()`, `getPricingTable()`, `loadDataset()`, `loadConfigFile()`, `discoverConfigFile()` exports
37
+
38
+ ### Changed
39
+
40
+ - `LLMProvider` interface now supports optional `callWithMeta()` for token usage (backward compatible)
41
+ - `dataset` field accepts file path strings (`.csv`, `.json`) in addition to inline arrays
42
+ - Config loader scans for `.yaml`/`.yml` files in `prompts/` directory
43
+
8
44
  ## [0.2.0] - 2026-04-05
9
45
 
10
46
  ### Added
package/README.md CHANGED
@@ -1,5 +1,9 @@
1
1
  # prompt-lock
2
2
 
3
+ [![CI](https://github.com/shmulikdav/Promptlock/actions/workflows/ci.yml/badge.svg)](https://github.com/shmulikdav/Promptlock/actions/workflows/ci.yml)
4
+ [![npm version](https://img.shields.io/npm/v/prompt-lock)](https://www.npmjs.com/package/prompt-lock)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
+
3
7
  **Version control and behavioral regression testing for LLM prompts.**
4
8
 
5
9
  prompt-lock wraps your prompts with behavioral assertions and snapshot baselines. On every change, it runs the assertion suite and flags regressions — like Jest for LLM behavior.
@@ -57,7 +61,33 @@ npx prompt-lock snapshot
57
61
 
58
62
  ## Defining Prompts
59
63
 
60
- Create `.js` files in your `prompts/` directory. You can export a single config or an array of configs:
64
+ Create config files as `.js`, `.yaml`, `.yml`, or `.json`. prompt-lock auto-discovers `promptlock.yaml`, `promptlock.yml`, or `promptlock.json` in your project root, or scans the `prompts/` directory.
65
+
66
+ ### YAML Config (recommended)
67
+
68
+ ```yaml
69
+ # promptlock.yaml
70
+ id: article-summarizer
71
+ version: '1.0.0'
72
+ provider: anthropic
73
+ model: claude-sonnet-4-20250514
74
+ prompt: |
75
+   Summarize the following article in 3 bullet points.
76
+   Article: {{article}}
77
+ defaultVars:
78
+   article: The quick brown fox jumped over the lazy dog.
79
+ assertions:
80
+   - type: contains
81
+     value: '•'
82
+   - type: max-length
83
+     chars: 500
84
+   - type: max-cost
85
+     dollars: 0.05
86
+ ```
87
+
88
+ ### JavaScript Config
89
+
90
+ Create `.js` files in your `prompts/` directory:
61
91
 
62
92
  ```javascript
63
93
  // prompts/summarizer.js
@@ -108,7 +138,7 @@ module.exports = {
108
138
 
109
139
  ### Testing with Datasets
110
140
 
111
- Test a prompt against multiple inputs:
141
+ Test a prompt against multiple inputs — inline or from external CSV/JSON files:
112
142
 
113
143
  ```javascript
114
144
  module.exports = {
@@ -117,11 +147,18 @@ module.exports = {
117
147
  model: 'gpt-4o-mini',
118
148
  prompt: 'Classify this ticket: {{ticket}}',
119
149
  defaultVars: { ticket: 'My payment failed' },
150
+
151
+ // Inline dataset
120
152
  dataset: [
121
153
  { ticket: 'My payment failed' },
122
154
  { ticket: 'How do I reset my password?' },
123
155
  { ticket: 'Your product is great!' },
124
156
  ],
157
+
158
+ // Or load from a file:
159
+ // dataset: './data/test-tickets.csv',
160
+ // dataset: './data/test-tickets.json',
161
+
125
162
  assertions: [
126
163
  { type: 'json-valid' },
127
164
  { type: 'max-latency', ms: 5000 },
@@ -129,6 +166,38 @@ module.exports = {
129
166
  };
130
167
  ```
131
168
 
169
+ CSV files use the first row as headers (= template variable names):
170
+
171
+ ```csv
172
+ ticket,expected_category
173
+ My payment failed,billing
174
+ How do I reset my password?,account
175
+ Your product is great!,feedback
176
+ ```
177
+
178
+ ### LLM-as-Judge
179
+
180
+ Use a separate LLM to evaluate output quality:
181
+
182
+ ```javascript
183
+ module.exports = {
184
+ id: 'creative-writer',
185
+ provider: 'anthropic',
186
+ model: 'claude-sonnet-4-20250514',
187
+ prompt: 'Write a haiku about {{topic}}',
188
+ defaultVars: { topic: 'coding' },
189
+ assertions: [
190
+ { type: 'max-length', chars: 200 },
191
+ {
192
+ type: 'llm-judge',
193
+ judge: { provider: 'openai', model: 'gpt-4o-mini' },
194
+ criteria: 'Is this a valid haiku with 5-7-5 syllable structure?',
195
+ threshold: 0.7, // pass if score >= 0.7
196
+ },
197
+ ],
198
+ };
199
+ ```
200
+
132
201
  ## Programmatic Usage
133
202
 
134
203
  ```typescript
@@ -167,12 +236,15 @@ prompt-lock run --id my-prompt # Run a specific prompt
167
236
  prompt-lock run --ci # Exit code 1 on failure
168
237
  prompt-lock run --report html # Generate HTML report
169
238
  prompt-lock run --report json # Generate JSON report
170
- prompt-lock run --report both # Generate both reports
239
+ prompt-lock run --report markdown # Generate Markdown report
240
+ prompt-lock run --report both # Generate JSON + HTML reports
171
241
  prompt-lock run --dry-run # Show what would be tested without calling LLMs
172
242
  prompt-lock run --verbose # Show detailed output per prompt
173
243
  prompt-lock run --parallel # Run prompts in parallel
174
244
  prompt-lock run --concurrency 10 # Max concurrent runs (default: 5)
175
245
  prompt-lock run --cache # Cache LLM outputs (skip unchanged prompts)
246
+ prompt-lock run --watch # Watch for file changes and re-run
247
+ prompt-lock run --ab v1:v2 # A/B compare two prompt IDs side-by-side
176
248
  prompt-lock run --github-pr owner/repo#123 # Post results as PR comment
177
249
  ```
178
250
 
@@ -226,6 +298,89 @@ prompt-lock run --cache
226
298
  prompt-lock cache clear
227
299
  ```
228
300
 
301
+ ### A/B Testing Mode
302
+
303
+ Compare two prompt variants side-by-side and pick a winner:
304
+
305
+ ```bash
306
+ prompt-lock run --ab summarizer-v1:summarizer-v2
307
+ ```
308
+
309
+ Output:
310
+
311
+ ```
312
+ A/B Comparison: summarizer-v1 vs summarizer-v2
313
+
314
+ | Metric | Variant A | Variant B | Delta |
315
+ | ------- | ----------------- | ----------------- | --------- |
316
+ | Status | ✅ 5/5 passed | ✅ 5/5 passed | — |
317
+ | Latency | 1250ms | 980ms | -270ms |
318
+ | Cost | $0.004500 | $0.003200 | -$0.00130 |
319
+ | Tokens | 1234 | 987 | -247 |
320
+
321
+ Winner: Variant B
322
+ ```
323
+
324
+ **Winner logic:**
325
+ 1. Higher pass rate wins
326
+ 2. Otherwise: >5% cheaper wins
327
+ 3. Otherwise: >10% faster wins
328
+ 4. Otherwise: tie
329
+
330
+ For signal over noise, use a `dataset` of size 5+ when A/B testing — a single LLM call isn't statistically meaningful.
331
+
332
+ Programmatic API:
333
+
334
+ ```typescript
335
+ import { runAB } from 'prompt-lock';
336
+
337
+ const result = await runAB(variantA, variantB);
338
+ console.log(result.winner); // 'A' | 'B' | 'tie'
339
+ console.log(result.deltas.costDollars);
340
+ ```
341
+
342
+ ### YAML Autocomplete (JSON Schema)
343
+
344
+ Get IDE autocomplete, validation, and inline docs for YAML configs. Add this comment at the top of your `promptlock.yaml`:
345
+
346
+ ```yaml
347
+ # yaml-language-server: $schema=https://raw.githubusercontent.com/shmulikdav/Promptlock/main/schemas/promptlock.schema.json
348
+ id: my-prompt
349
+ provider: anthropic
350
+ model: claude-sonnet-4-20250514
351
+ # VS Code now suggests all valid fields and assertion types
352
+ ```
353
+
354
+ Install the [YAML extension](https://marketplace.visualstudio.com/items?itemName=redhat.vscode-yaml) in VS Code to enable schema support.
355
+
356
+ ### Cost & Token Tracking
357
+
358
+ prompt-lock automatically tracks token usage and estimates cost for OpenAI and Anthropic calls. Costs are shown in console output and included in all report formats.
359
+
360
+ ```bash
361
+ prompt-lock run --verbose # Shows per-prompt token counts and cost
362
+ ```
363
+
364
+ Use the `max-cost` assertion to enforce budget limits:
365
+
366
+ ```yaml
367
+ assertions:
368
+   - type: max-cost
369
+     dollars: 0.05
370
+ ```
371
+
372
+ Built-in pricing for GPT-4o, GPT-4o-mini, Claude Sonnet 4, Claude Haiku, and more. Unknown models show zero cost.
373
+
374
+ ### Watch Mode
375
+
376
+ Auto-rerun on file changes during prompt development:
377
+
378
+ ```bash
379
+ prompt-lock run --watch
380
+ ```
381
+
382
+ Watches your config files and `prompts/` directory. Debounces rapid changes (500ms).
383
+
229
384
  ### Retry Logic
230
385
 
231
386
  All LLM provider calls automatically retry on transient errors (rate limits, timeouts, network errors) with exponential backoff. Default: 3 retries. Use `--verbose` to see retry activity.
@@ -258,7 +413,9 @@ This posts a markdown table with pass/fail results and failure details. If a com
258
413
  | `no-hallucination-words` | `words?: string[]` | Output does NOT contain hallucination indicators |
259
414
  | `no-duplicates` | `separator?: string` | Output has no duplicate items (split by separator, default `\n`) |
260
415
  | `max-latency` | `ms: number` | LLM response time is under N milliseconds |
261
- | `custom` | `name: string, fn: (output) => boolean` | User-provided function returning boolean |
416
+ | `max-cost` | `dollars: number` | LLM call cost is at most N dollars |
417
+ | `llm-judge` | `judge: {provider, model}, criteria: string, threshold?: number` | Another LLM scores output quality (0-1); passes when the score is at least `threshold` (default 0.7) |
418
+ | `custom` | `name: string, fn: (output) => boolean` | User-provided function returning boolean (JS configs only) |
262
419
 
263
420
  ## Provider Setup
264
421
 
@@ -1 +1 @@
1
- {"version":3,"file":"builtin.d.ts","sourceRoot":"","sources":["../../src/assertions/builtin.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAE,MAAM,UAAU,CAAC;AAc3C,KAAK,gBAAgB,GAAG,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAAK,eAAe,CAAC;AAE7F,eAAO,MAAM,iBAAiB,EAAE,MAAM,CAAC,MAAM,EAAE,gBAAgB,CAoK9D,CAAC"}
1
+ {"version":3,"file":"builtin.d.ts","sourceRoot":"","sources":["../../src/assertions/builtin.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAE,MAAM,UAAU,CAAC;AAc3C,KAAK,gBAAgB,GAAG,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,KAAK,eAAe,CAAC;AAE7F,eAAO,MAAM,iBAAiB,EAAE,MAAM,CAAC,MAAM,EAAE,gBAAgB,CAgL9D,CAAC"}
@@ -157,6 +157,17 @@ exports.builtinAssertions = {
157
157
  actual: `${actual}ms`,
158
158
  };
159
159
  },
160
+ 'max-cost': (output, config) => {
161
+ const dollars = config.dollars;
162
+ const actual = config.__cost ?? 0;
163
+ return {
164
+ type: 'max-cost',
165
+ name: `max-cost $${dollars}`,
166
+ passed: actual <= dollars,
167
+ expected: `<= $${dollars}`,
168
+ actual: `$${actual.toFixed(6)}`,
169
+ };
170
+ },
160
171
  'no-hallucination-words': (output, config) => {
161
172
  const words = config.words ?? DEFAULT_HALLUCINATION_WORDS;
162
173
  const found = words.filter(w => output.includes(w));
@@ -1 +1 @@
1
- {"version":3,"file":"builtin.js","sourceRoot":"","sources":["../../src/assertions/builtin.ts"],"names":[],"mappings":";;;AAEA,MAAM,2BAA2B,GAAG;IAClC,UAAU;IACV,UAAU;IACV,UAAU;IACV,UAAU;IACV,sBAAsB;IACtB,kBAAkB;IAClB,kBAAkB;IAClB,qBAAqB;IACrB,qBAAqB;CACtB,CAAC;AAIW,QAAA,iBAAiB,GAAqC;IACjE,UAAU,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QAC7B,MAAM,KAAK,GAAG,MAAM,CAAC,KAAe,CAAC;QACrC,OAAO;YACL,IAAI,EAAE,UAAU;YAChB,IAAI,EAAE,aAAa,KAAK,GAAG;YAC3B,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC;YAC9B,QAAQ,EAAE,sBAAsB,KAAK,GAAG;YACxC,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,WAAW;SACvD,CAAC;IACJ,CAAC;IAED,cAAc,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QACjC,MAAM,KAAK,GAAG,MAAM,CAAC,KAAe,CAAC;QACrC,OAAO;YACL,IAAI,EAAE,cAAc;YACpB,IAAI,EAAE,iBAAiB,KAAK,GAAG;YAC/B,MAAM,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC;YAC/B,QAAQ,EAAE,0BAA0B,KAAK,GAAG;YAC5C,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,WAAW;SACvD,CAAC;IACJ,CAAC;IAED,aAAa,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QAChC,MAAM,KAAK,GAAG,MAAM,CAAC,KAAe,CAAC;QACrC,OAAO;YACL,IAAI,EAAE,aAAa;YACnB,IAAI,EAAE,gBAAgB,KAAK,GAAG;YAC9B,MAAM,EAAE,MAAM,CAAC,UAAU,CAAC,KAAK,CAAC;YAChC,QAAQ,EAAE,yBAAyB,KAAK,GAAG;YAC3C,MAAM,EAAE,gBAAgB,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,GAAG;SACzD,CAAC;IACJ,CAAC;IAED,WAAW,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QAC9B,MAAM,KAAK,GAAG,MAAM,CAAC,KAAe,CAAC;QACrC,OAAO;YACL,IAAI,EAAE,WAAW;YACjB,IAAI,EAAE,cAAc,KAAK,GAAG;YAC5B,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC;YAC9B,QAAQ,EAAE,uBAAuB,KAAK,GAAG;YACzC,MAAM,EAAE,cAAc,MAAM,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG;SACrD,CAAC;IACJ,CAAC;IAED,eAAe,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QAClC,MAAM,OAAO,GAAG,MAAM,CAAC,OAAiB,CAAC;QACzC,IAAI,KAAa,CAAC;QAClB,IAAI,CAAC;YACH,KAAK,GAAG,IAAI,MAAM,CAAC,OAAO,CAAC,CAAC;QAC9B,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO;gBACL,IAAI,EAAE,eAAe;gBACrB,IAAI,EAAE,kBAAkB,OAAO,GAAG;gBAClC,MAAM,EAAE,KAAK;gBACb,QAAQ,EAAE,oBAAoB,OAAO,GAAG;gBACxC,MAAM,EAAE,kBAAmB,CAAW,CAAC,OAAO,EAAE;gBAChD,OAAO,EAAE,0BAA2B,
CAAW,CAAC,OAAO,EAAE;aAC1D,CAAC;QACJ,CAAC;QACD,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACnC,OAAO;YACL,IAAI,EAAE,eAAe;YACrB,IAAI,EAAE,kBAAkB,OAAO,GAAG;YAClC,MAAM,EAAE,OAAO;YACf,QAAQ,EAAE,oBAAoB,OAAO,GAAG;YACxC,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,UAAU;SACzC,CAAC;IACJ,CAAC;IAED,YAAY,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QAC/B,MAAM,KAAK,GAAG,MAAM,CAAC,KAAe,CAAC;QACrC,OAAO;YACL,IAAI,EAAE,YAAY;YAClB,IAAI,EAAE,cAAc,KAAK,EAAE;YAC3B,MAAM,EAAE,MAAM,CAAC,MAAM,IAAI,KAAK;YAC9B,QAAQ,EAAE,MAAM,KAAK,QAAQ;YAC7B,MAAM,EAAE,GAAG,MAAM,CAAC,MAAM,QAAQ;SACjC,CAAC;IACJ,CAAC;IAED,YAAY,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QAC/B,MAAM,KAAK,GAAG,MAAM,CAAC,KAAe,CAAC;QACrC,OAAO;YACL,IAAI,EAAE,YAAY;YAClB,IAAI,EAAE,cAAc,KAAK,EAAE;YAC3B,MAAM,EAAE,MAAM,CAAC,MAAM,IAAI,KAAK;YAC9B,QAAQ,EAAE,MAAM,KAAK,QAAQ;YAC7B,MAAM,EAAE,GAAG,MAAM,CAAC,MAAM,QAAQ;SACjC,CAAC;IACJ,CAAC;IAED,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE;QACvB,IAAI,MAAM,GAAG,KAAK,CAAC;QACnB,IAAI,OAA2B,CAAC;QAChC,IAAI,CAAC;YACH,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;YACnB,MAAM,GAAG,IAAI,CAAC;QAChB,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,GAAI,CAAW,CAAC,OAAO,CAAC;QACjC,CAAC;QACD,OAAO;YACL,IAAI,EAAE,YAAY;YAClB,IAAI,EAAE,YAAY;YAClB,MAAM;YACN,QAAQ,EAAE,YAAY;YACtB,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,iBAAiB,OAAO,EAAE;SAC3D,CAAC;IACJ,CAAC;IAED,cAAc,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QACjC,MAAM,MAAM,GAAG,MAAM,CAAC,MAAkB,CAAC;QACzC,MAAM,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QACxD,OAAO;YACL,IAAI,EAAE,cAAc;YACpB,IAAI,EAAE,iBAAiB,MAAM,CAAC,MAAM,SAAS;YAC7C,MAAM,EAAE,OAAO,CAAC,MAAM,KAAK,CAAC;YAC5B,QAAQ,EAAE,6BAA6B,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;YAC1D,MAAM,EAAE,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,WAAW;SAC5E,CAAC;IACJ,CAAC;IAED,eAAe,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QAClC,MAAM,SAAS,GAAI,MAAM,CAAC,SAAgC,IAAI,IAAI,CAAC;QACnE,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CA
AC,MAAM,CAAC,OAAO,CAAC,CAAC;QACzE,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;QAC/B,MAAM,UAAU,GAAa,EAAE,CAAC;QAChC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,IAAI,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC;gBAAE,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC1C,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QACjB,CAAC;QACD,OAAO;YACL,IAAI,EAAE,eAAe;YACrB,IAAI,EAAE,eAAe;YACrB,MAAM,EAAE,UAAU,CAAC,MAAM,KAAK,CAAC;YAC/B,QAAQ,EAAE,oBAAoB;YAC9B,MAAM,EAAE,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,eAAe,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,eAAe;SACzF,CAAC;IACJ,CAAC;IAED,aAAa,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QAChC,uEAAuE;QACvE,MAAM,EAAE,GAAG,MAAM,CAAC,EAAY,CAAC;QAC/B,MAAM,MAAM,GAAI,MAAM,CAAC,UAAqB,IAAI,CAAC,CAAC;QAClD,OAAO;YACL,IAAI,EAAE,aAAa;YACnB,IAAI,EAAE,eAAe,EAAE,IAAI;YAC3B,MAAM,EAAE,MAAM,IAAI,EAAE;YACpB,QAAQ,EAAE,MAAM,EAAE,IAAI;YACtB,MAAM,EAAE,GAAG,MAAM,IAAI;SACtB,CAAC;IACJ,CAAC;IAED,wBAAwB,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QAC3C,MAAM,KAAK,GAAI,MAAM,CAAC,KAA8B,IAAI,2BAA2B,CAAC;QACpF,MAAM,KAAK,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QACpD,OAAO;YACL,IAAI,EAAE,wBAAwB;YAC9B,IAAI,EAAE,wBAAwB;YAC9B,MAAM,EAAE,KAAK,CAAC,MAAM,KAAK,CAAC;YAC1B,QAAQ,EAAE,wBAAwB;YAClC,MAAM,EAAE,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,YAAY;SACvE,CAAC;IACJ,CAAC;CACF,CAAC"}
1
+ {"version":3,"file":"builtin.js","sourceRoot":"","sources":["../../src/assertions/builtin.ts"],"names":[],"mappings":";;;AAEA,MAAM,2BAA2B,GAAG;IAClC,UAAU;IACV,UAAU;IACV,UAAU;IACV,UAAU;IACV,sBAAsB;IACtB,kBAAkB;IAClB,kBAAkB;IAClB,qBAAqB;IACrB,qBAAqB;CACtB,CAAC;AAIW,QAAA,iBAAiB,GAAqC;IACjE,UAAU,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QAC7B,MAAM,KAAK,GAAG,MAAM,CAAC,KAAe,CAAC;QACrC,OAAO;YACL,IAAI,EAAE,UAAU;YAChB,IAAI,EAAE,aAAa,KAAK,GAAG;YAC3B,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC;YAC9B,QAAQ,EAAE,sBAAsB,KAAK,GAAG;YACxC,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,WAAW;SACvD,CAAC;IACJ,CAAC;IAED,cAAc,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QACjC,MAAM,KAAK,GAAG,MAAM,CAAC,KAAe,CAAC;QACrC,OAAO;YACL,IAAI,EAAE,cAAc;YACpB,IAAI,EAAE,iBAAiB,KAAK,GAAG;YAC/B,MAAM,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC;YAC/B,QAAQ,EAAE,0BAA0B,KAAK,GAAG;YAC5C,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,WAAW;SACvD,CAAC;IACJ,CAAC;IAED,aAAa,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QAChC,MAAM,KAAK,GAAG,MAAM,CAAC,KAAe,CAAC;QACrC,OAAO;YACL,IAAI,EAAE,aAAa;YACnB,IAAI,EAAE,gBAAgB,KAAK,GAAG;YAC9B,MAAM,EAAE,MAAM,CAAC,UAAU,CAAC,KAAK,CAAC;YAChC,QAAQ,EAAE,yBAAyB,KAAK,GAAG;YAC3C,MAAM,EAAE,gBAAgB,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,MAAM,CAAC,GAAG;SACzD,CAAC;IACJ,CAAC;IAED,WAAW,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QAC9B,MAAM,KAAK,GAAG,MAAM,CAAC,KAAe,CAAC;QACrC,OAAO;YACL,IAAI,EAAE,WAAW;YACjB,IAAI,EAAE,cAAc,KAAK,GAAG;YAC5B,MAAM,EAAE,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC;YAC9B,QAAQ,EAAE,uBAAuB,KAAK,GAAG;YACzC,MAAM,EAAE,cAAc,MAAM,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG;SACrD,CAAC;IACJ,CAAC;IAED,eAAe,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QAClC,MAAM,OAAO,GAAG,MAAM,CAAC,OAAiB,CAAC;QACzC,IAAI,KAAa,CAAC;QAClB,IAAI,CAAC;YACH,KAAK,GAAG,IAAI,MAAM,CAAC,OAAO,CAAC,CAAC;QAC9B,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO;gBACL,IAAI,EAAE,eAAe;gBACrB,IAAI,EAAE,kBAAkB,OAAO,GAAG;gBAClC,MAAM,EAAE,KAAK;gBACb,QAAQ,EAAE,oBAAoB,OAAO,GAAG;gBACxC,MAAM,EAAE,kBAAmB,CAAW,CAAC,OAAO,EAAE;gBAChD,OAAO,EAAE,0BAA2B,
CAAW,CAAC,OAAO,EAAE;aAC1D,CAAC;QACJ,CAAC;QACD,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACnC,OAAO;YACL,IAAI,EAAE,eAAe;YACrB,IAAI,EAAE,kBAAkB,OAAO,GAAG;YAClC,MAAM,EAAE,OAAO;YACf,QAAQ,EAAE,oBAAoB,OAAO,GAAG;YACxC,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,UAAU;SACzC,CAAC;IACJ,CAAC;IAED,YAAY,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QAC/B,MAAM,KAAK,GAAG,MAAM,CAAC,KAAe,CAAC;QACrC,OAAO;YACL,IAAI,EAAE,YAAY;YAClB,IAAI,EAAE,cAAc,KAAK,EAAE;YAC3B,MAAM,EAAE,MAAM,CAAC,MAAM,IAAI,KAAK;YAC9B,QAAQ,EAAE,MAAM,KAAK,QAAQ;YAC7B,MAAM,EAAE,GAAG,MAAM,CAAC,MAAM,QAAQ;SACjC,CAAC;IACJ,CAAC;IAED,YAAY,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QAC/B,MAAM,KAAK,GAAG,MAAM,CAAC,KAAe,CAAC;QACrC,OAAO;YACL,IAAI,EAAE,YAAY;YAClB,IAAI,EAAE,cAAc,KAAK,EAAE;YAC3B,MAAM,EAAE,MAAM,CAAC,MAAM,IAAI,KAAK;YAC9B,QAAQ,EAAE,MAAM,KAAK,QAAQ;YAC7B,MAAM,EAAE,GAAG,MAAM,CAAC,MAAM,QAAQ;SACjC,CAAC;IACJ,CAAC;IAED,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE;QACvB,IAAI,MAAM,GAAG,KAAK,CAAC;QACnB,IAAI,OAA2B,CAAC;QAChC,IAAI,CAAC;YACH,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;YACnB,MAAM,GAAG,IAAI,CAAC;QAChB,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,GAAI,CAAW,CAAC,OAAO,CAAC;QACjC,CAAC;QACD,OAAO;YACL,IAAI,EAAE,YAAY;YAClB,IAAI,EAAE,YAAY;YAClB,MAAM;YACN,QAAQ,EAAE,YAAY;YACtB,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,iBAAiB,OAAO,EAAE;SAC3D,CAAC;IACJ,CAAC;IAED,cAAc,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QACjC,MAAM,MAAM,GAAG,MAAM,CAAC,MAAkB,CAAC;QACzC,MAAM,OAAO,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QACxD,OAAO;YACL,IAAI,EAAE,cAAc;YACpB,IAAI,EAAE,iBAAiB,MAAM,CAAC,MAAM,SAAS;YAC7C,MAAM,EAAE,OAAO,CAAC,MAAM,KAAK,CAAC;YAC5B,QAAQ,EAAE,6BAA6B,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;YAC1D,MAAM,EAAE,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,WAAW;SAC5E,CAAC;IACJ,CAAC;IAED,eAAe,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QAClC,MAAM,SAAS,GAAI,MAAM,CAAC,SAAgC,IAAI,IAAI,CAAC;QACnE,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CA
AC,MAAM,CAAC,OAAO,CAAC,CAAC;QACzE,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;QAC/B,MAAM,UAAU,GAAa,EAAE,CAAC;QAChC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,IAAI,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC;gBAAE,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC1C,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QACjB,CAAC;QACD,OAAO;YACL,IAAI,EAAE,eAAe;YACrB,IAAI,EAAE,eAAe;YACrB,MAAM,EAAE,UAAU,CAAC,MAAM,KAAK,CAAC;YAC/B,QAAQ,EAAE,oBAAoB;YAC9B,MAAM,EAAE,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,eAAe,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,eAAe;SACzF,CAAC;IACJ,CAAC;IAED,aAAa,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QAChC,uEAAuE;QACvE,MAAM,EAAE,GAAG,MAAM,CAAC,EAAY,CAAC;QAC/B,MAAM,MAAM,GAAI,MAAM,CAAC,UAAqB,IAAI,CAAC,CAAC;QAClD,OAAO;YACL,IAAI,EAAE,aAAa;YACnB,IAAI,EAAE,eAAe,EAAE,IAAI;YAC3B,MAAM,EAAE,MAAM,IAAI,EAAE;YACpB,QAAQ,EAAE,MAAM,EAAE,IAAI;YACtB,MAAM,EAAE,GAAG,MAAM,IAAI;SACtB,CAAC;IACJ,CAAC;IAED,UAAU,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QAC7B,MAAM,OAAO,GAAG,MAAM,CAAC,OAAiB,CAAC;QACzC,MAAM,MAAM,GAAI,MAAM,CAAC,MAAiB,IAAI,CAAC,CAAC;QAC9C,OAAO;YACL,IAAI,EAAE,UAAU;YAChB,IAAI,EAAE,aAAa,OAAO,EAAE;YAC5B,MAAM,EAAE,MAAM,IAAI,OAAO;YACzB,QAAQ,EAAE,OAAO,OAAO,EAAE;YAC1B,MAAM,EAAE,IAAI,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE;SAChC,CAAC;IACJ,CAAC;IAED,wBAAwB,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,EAAE;QAC3C,MAAM,KAAK,GAAI,MAAM,CAAC,KAA8B,IAAI,2BAA2B,CAAC;QACpF,MAAM,KAAK,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;QACpD,OAAO;YACL,IAAI,EAAE,wBAAwB;YAC9B,IAAI,EAAE,wBAAwB;YAC9B,MAAM,EAAE,KAAK,CAAC,MAAM,KAAK,CAAC;YAC1B,QAAQ,EAAE,wBAAwB;YAClC,MAAM,EAAE,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,YAAY;SACvE,CAAC;IACJ,CAAC;CACF,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/assertions/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAE,eAAe,EAAE,MAAM,UAAU,CAAC;AAK5D,wBAAsB,aAAa,CACjC,MAAM,EAAE,MAAM,EACd,UAAU,EAAE,eAAe,EAAE,GAC5B,OAAO,CAAC,eAAe,EAAE,CAAC,CA6B5B"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/assertions/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAE,eAAe,EAAE,MAAM,UAAU,CAAC;AAM5D,wBAAsB,aAAa,CACjC,MAAM,EAAE,MAAM,EACd,UAAU,EAAE,eAAe,EAAE,GAC5B,OAAO,CAAC,eAAe,EAAE,CAAC,CAuC5B"}
@@ -4,6 +4,7 @@ exports.runAssertions = runAssertions;
4
4
  const builtin_1 = require("./builtin");
5
5
  const json_schema_1 = require("./json-schema");
6
6
  const custom_1 = require("./custom");
7
+ const llm_judge_1 = require("./llm-judge");
7
8
  async function runAssertions(output, assertions) {
8
9
  const results = [];
9
10
  for (const assertion of assertions) {
@@ -11,6 +12,10 @@ async function runAssertions(output, assertions) {
11
12
  results.push((0, json_schema_1.assertJsonSchema)(output, assertion.schema));
12
13
  continue;
13
14
  }
15
+ if (assertion.type === 'llm-judge') {
16
+ results.push(await (0, llm_judge_1.assertLlmJudge)(output, assertion.judge, assertion.criteria, assertion.threshold ?? 0.7));
17
+ continue;
18
+ }
14
19
  if (assertion.type === 'custom') {
15
20
  results.push(await (0, custom_1.assertCustom)(output, assertion.name, assertion.fn));
16
21
  continue;
@@ -1 +1 @@
1
- {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/assertions/index.ts"],"names":[],"mappings":";;AAKA,sCAgCC;AApCD,uCAA8C;AAC9C,+CAAiD;AACjD,qCAAwC;AAEjC,KAAK,UAAU,aAAa,CACjC,MAAc,EACd,UAA6B;IAE7B,MAAM,OAAO,GAAsB,EAAE,CAAC;IAEtC,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;QACnC,IAAI,SAAS,CAAC,IAAI,KAAK,aAAa,EAAE,CAAC;YACrC,OAAO,CAAC,IAAI,CAAC,IAAA,8BAAgB,EAAC,MAAM,EAAE,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC;YACzD,SAAS;QACX,CAAC;QAED,IAAI,SAAS,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;YAChC,OAAO,CAAC,IAAI,CAAC,MAAM,IAAA,qBAAY,EAAC,MAAM,EAAE,SAAS,CAAC,IAAI,EAAE,SAAS,CAAC,EAAE,CAAC,CAAC,CAAC;YACvE,SAAS;QACX,CAAC;QAED,MAAM,OAAO,GAAG,2BAAiB,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;QAClD,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,OAAO,CAAC,IAAI,CAAC;gBACX,IAAI,EAAE,SAAS,CAAC,IAAI;gBACpB,IAAI,EAAE,SAAS,CAAC,IAAI;gBACpB,MAAM,EAAE,KAAK;gBACb,OAAO,EAAE,4BAA4B,SAAS,CAAC,IAAI,GAAG;aACvD,CAAC,CAAC;YACH,SAAS;QACX,CAAC;QAED,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,SAA+C,CAAC,CAAC,CAAC;IACjF,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC"}
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/assertions/index.ts"],"names":[],"mappings":";;AAMA,sCA0CC;AA/CD,uCAA8C;AAC9C,+CAAiD;AACjD,qCAAwC;AACxC,2CAA6C;AAEtC,KAAK,UAAU,aAAa,CACjC,MAAc,EACd,UAA6B;IAE7B,MAAM,OAAO,GAAsB,EAAE,CAAC;IAEtC,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;QACnC,IAAI,SAAS,CAAC,IAAI,KAAK,aAAa,EAAE,CAAC;YACrC,OAAO,CAAC,IAAI,CAAC,IAAA,8BAAgB,EAAC,MAAM,EAAE,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC;YACzD,SAAS;QACX,CAAC;QAED,IAAI,SAAS,CAAC,IAAI,KAAK,WAAW,EAAE,CAAC;YACnC,OAAO,CAAC,IAAI,CAAC,MAAM,IAAA,0BAAc,EAC/B,MAAM,EACN,SAAS,CAAC,KAAK,EACf,SAAS,CAAC,QAAQ,EAClB,SAAS,CAAC,SAAS,IAAI,GAAG,CAC3B,CAAC,CAAC;YACH,SAAS;QACX,CAAC;QAED,IAAI,SAAS,CAAC,IAAI,KAAK,QAAQ,EAAE,CAAC;YAChC,OAAO,CAAC,IAAI,CAAC,MAAM,IAAA,qBAAY,EAAC,MAAM,EAAE,SAAS,CAAC,IAAI,EAAE,SAAS,CAAC,EAAE,CAAC,CAAC,CAAC;YACvE,SAAS;QACX,CAAC;QAED,MAAM,OAAO,GAAG,2BAAiB,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;QAClD,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,OAAO,CAAC,IAAI,CAAC;gBACX,IAAI,EAAE,SAAS,CAAC,IAAI;gBACpB,IAAI,EAAE,SAAS,CAAC,IAAI;gBACpB,MAAM,EAAE,KAAK;gBACb,OAAO,EAAE,4BAA4B,SAAS,CAAC,IAAI,GAAG;aACvD,CAAC,CAAC;YACH,SAAS;QACX,CAAC;QAED,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,EAAE,SAA+C,CAAC,CAAC,CAAC;IACjF,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC"}
@@ -0,0 +1,6 @@
1
+ import { AssertionResult, ProviderConfig } from '../types';
2
+ export declare function assertLlmJudge(output: string, judgeConfig: {
3
+ provider: ProviderConfig;
4
+ model: string;
5
+ }, criteria: string, threshold: number): Promise<AssertionResult>;
6
+ //# sourceMappingURL=llm-judge.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"llm-judge.d.ts","sourceRoot":"","sources":["../../src/assertions/llm-judge.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,UAAU,CAAC;AAW3D,wBAAsB,cAAc,CAClC,MAAM,EAAE,MAAM,EACd,WAAW,EAAE;IAAE,QAAQ,EAAE,cAAc,CAAC;IAAC,KAAK,EAAE,MAAM,CAAA;CAAE,EACxD,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,MAAM,GAChB,OAAO,CAAC,eAAe,CAAC,CAyB1B"}
@@ -0,0 +1,75 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.assertLlmJudge = assertLlmJudge;
4
+ const providers_1 = require("../providers");
5
+ const JUDGE_PROMPT = `You are an evaluator. Score the following LLM output on a scale of 0.0 to 1.0 based on the given criteria.
6
+ Respond with ONLY a JSON object: {"score": <number>, "explanation": "<brief reason>"}
7
+
8
+ Criteria: {{criteria}}
9
+
10
+ Output to evaluate:
11
+ {{output}}`;
12
/**
 * Asks a second "judge" model to score `output` against `criteria` and
 * converts that score into an assertion result.
 *
 * The function never rejects because of a provider or parsing failure: any
 * error raised while obtaining or parsing the judge's answer is reported as a
 * failed assertion whose `message` starts with "Judge error:".
 *
 * @param {string} output - The LLM output being evaluated.
 * @param {{provider: object, model: string}} judgeConfig - Provider config and
 *   model name handed to `getProvider` to build the judge.
 * @param {string} criteria - Grading criteria inserted into the judge prompt;
 *   its first 50 characters are also used in the result name.
 * @param {number} threshold - Minimum score for the assertion to pass.
 * @returns {Promise<object>} Result with `type`, `name`, `passed` and either
 *   `expected`/`actual` (judge answered) or `message` (judge failed).
 */
async function assertLlmJudge(output, judgeConfig, criteria, threshold) {
    const name = `llm-judge: ${criteria.slice(0, 50)}`;
    try {
        const judge = (0, providers_1.getProvider)(judgeConfig.provider, judgeConfig.model);
        // Fill both placeholders in ONE pass with a function replacer:
        //  - a function's return value is inserted verbatim, so "$&", "$$",
        //    "$'" etc. inside the criteria/output are not treated as
        //    replacement patterns;
        //  - substituted text is never rescanned, so a criteria string that
        //    happens to contain "{{output}}" cannot hijack the output slot.
        const substitutions = { criteria, output };
        const prompt = JUDGE_PROMPT.replace(/\{\{(criteria|output)\}\}/g, (_placeholder, key) => substitutions[key]);
        const response = await judge.call(prompt);
        const { score, explanation } = parseJudgeResponse(response);
        return {
            type: 'llm-judge',
            name,
            passed: score >= threshold,
            expected: `>= ${threshold}`,
            actual: `${score.toFixed(2)} — ${explanation}`,
        };
    }
    catch (error) {
        // Thrown values are not guaranteed to be Error instances (or even
        // objects); avoid a secondary TypeError while building the message.
        const reason = error?.message ?? String(error);
        return {
            type: 'llm-judge',
            name,
            passed: false,
            message: `Judge error: ${reason}`,
        };
    }
}
37
/**
 * Extracts a score in [0, 1] and an explanation from the judge model's reply.
 *
 * Three strategies are tried in order:
 *  1. the whole reply is a JSON object with a numeric `score`;
 *  2. a flat `{ ... "score": <number> ... }` object embedded in other text
 *     (e.g. wrapped in a markdown code fence);
 *  3. a standalone number between 0 and 1 anywhere in the text.
 *
 * Scores obtained from JSON are clamped to [0, 1].
 *
 * @param {string} response - Raw text returned by the judge model.
 * @returns {{score: number, explanation: string}} Parsed score and reason.
 * @throws {Error} When no score can be extracted from the reply.
 */
function parseJudgeResponse(response) {
    const clamp = (value) => Math.max(0, Math.min(1, value));
    // Strategy 1: the reply is exactly the requested JSON object.
    try {
        const json = JSON.parse(response.trim());
        if (typeof json.score === 'number') {
            return {
                score: clamp(json.score),
                explanation: json.explanation || '',
            };
        }
    }
    catch {
        // Not valid JSON (or not an object) — try the looser strategies.
    }
    // Strategy 2: JSON object embedded in surrounding text / markdown.
    const jsonMatch = response.match(/\{[^}]*"score"\s*:\s*([\d.]+)[^}]*\}/);
    if (jsonMatch) {
        try {
            const json = JSON.parse(jsonMatch[0]);
            // Same guard as strategy 1 so a non-numeric score never leaks out.
            if (typeof json.score === 'number') {
                return {
                    score: clamp(json.score),
                    explanation: json.explanation || '',
                };
            }
        }
        catch {
            // Matched text was not valid JSON — fall through.
        }
    }
    // Strategy 3: a standalone number between 0 and 1.
    // The lookarounds stop the pattern from grabbing a fragment of a larger
    // number: without them "8.0/10" yields 0 (the fractional digit) and "1.5"
    // yields 1 (the integer part). A trailing sentence period ("0.8.") is
    // still accepted because the lookahead only rejects a following digit.
    const numMatch = response.match(/(?<!\d\.)\b(0(?:\.\d+)?|1(?:\.0+)?)\b(?!\.?\d)/);
    if (numMatch) {
        return {
            score: parseFloat(numMatch[1]),
            explanation: 'Score extracted from unstructured response',
        };
    }
    throw new Error(`Could not parse judge response: ${response.slice(0, 200)}`);
}
75
+ //# sourceMappingURL=llm-judge.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"llm-judge.js","sourceRoot":"","sources":["../../src/assertions/llm-judge.ts"],"names":[],"mappings":";;AAWA,wCA8BC;AAxCD,4CAA2C;AAE3C,MAAM,YAAY,GAAG;;;;;;WAMV,CAAC;AAEL,KAAK,UAAU,cAAc,CAClC,MAAc,EACd,WAAwD,EACxD,QAAgB,EAChB,SAAiB;IAEjB,IAAI,CAAC;QACH,MAAM,KAAK,GAAG,IAAA,uBAAW,EAAC,WAAW,CAAC,QAAQ,EAAE,WAAW,CAAC,KAAK,CAAC,CAAC;QACnE,MAAM,MAAM,GAAG,YAAY;aACxB,OAAO,CAAC,cAAc,EAAE,QAAQ,CAAC;aACjC,OAAO,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC;QAEjC,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QAC1C,MAAM,EAAE,KAAK,EAAE,WAAW,EAAE,GAAG,kBAAkB,CAAC,QAAQ,CAAC,CAAC;QAE5D,OAAO;YACL,IAAI,EAAE,WAAW;YACjB,IAAI,EAAE,cAAc,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE;YAC3C,MAAM,EAAE,KAAK,IAAI,SAAS;YAC1B,QAAQ,EAAE,MAAM,SAAS,EAAE;YAC3B,MAAM,EAAE,GAAG,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,WAAW,EAAE;SAC/C,CAAC;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO;YACL,IAAI,EAAE,WAAW;YACjB,IAAI,EAAE,cAAc,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE;YAC3C,MAAM,EAAE,KAAK;YACb,OAAO,EAAE,gBAAiB,KAAe,CAAC,OAAO,EAAE;SACpD,CAAC;IACJ,CAAC;AACH,CAAC;AAED,SAAS,kBAAkB,CAAC,QAAgB;IAC1C,uBAAuB;IACvB,IAAI,CAAC;QACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC;QACzC,IAAI,OAAO,IAAI,CAAC,KAAK,KAAK,QAAQ,EAAE,CAAC;YACnC,OAAO;gBACL,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC;gBAC3C,WAAW,EAAE,IAAI,CAAC,WAAW,IAAI,EAAE;aACpC,CAAC;QACJ,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,wBAAwB;IAC1B,CAAC;IAED,mEAAmE;IACnE,MAAM,SAAS,GAAG,QAAQ,CAAC,KAAK,CAAC,sCAAsC,CAAC,CAAC;IACzE,IAAI,SAAS,EAAE,CAAC;QACd,IAAI,CAAC;YACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC;YACtC,OAAO;gBACL,KAAK,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC;gBAC3C,WAAW,EAAE,IAAI,CAAC,WAAW,IAAI,EAAE;aACpC,CAAC;QACJ,CAAC;QAAC,MAAM,CAAC;YACP,eAAe;QACjB,CAAC;IACH,CAAC;IAED,kDAAkD;IAClD,MAAM,QAAQ,GAAG,QAAQ,CAAC,KAAK,CAAC,8BAA8B,CAAC,CAAC;IAChE,IAAI,QAAQ,EAAE,CAAC;QACb,OAAO;YACL,KAAK,EAAE,UAAU,CAAC,QAAQ,CAAC,CAAC,CAAC,CAA
C;YAC9B,WAAW,EAAE,4CAA4C;SAC1D,CAAC;IACJ,CAAC;IAED,MAAM,IAAI,KAAK,CAAC,mCAAmC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC,CAAC;AAC/E,CAAC"}
package/dist/cli.js CHANGED
@@ -41,18 +41,18 @@ const commander_1 = require("commander");
41
41
  const path = __importStar(require("path"));
42
42
  const fs = __importStar(require("fs"));
43
43
  const chalk_1 = __importDefault(require("chalk"));
44
- const config_validation_1 = require("./config-validation");
45
44
  const runner_1 = require("./runner");
46
45
  const github_1 = require("./github");
47
46
  const cache_1 = require("./cache");
48
47
  const snapshot_1 = require("./snapshot");
49
48
  const reporter_1 = require("./reporter");
49
+ const config_loader_1 = require("./config-loader");
50
50
  const utils_1 = require("./utils");
51
51
  const program = new commander_1.Command();
52
52
  program
53
53
  .name('prompt-lock')
54
54
  .description('Version control and behavioral regression testing for LLM prompts')
55
- .version('0.2.0');
55
+ .version('0.4.0');
56
56
  // ── init ──────────────────────────────────────────────────────────────────────
57
57
  program
58
58
  .command('init')
@@ -166,6 +166,8 @@ program
166
166
  .option('--cache', 'Cache LLM outputs (skip calls when prompt+model unchanged)')
167
167
  .option('--no-cache', 'Disable output caching')
168
168
  .option('--github-pr <pr>', 'Post results as a GitHub PR comment (e.g. owner/repo#123)')
169
+ .option('--watch', 'Watch for file changes and re-run automatically')
170
+ .option('--ab <pair>', 'Compare two prompt IDs side-by-side (e.g. "v1:v2")')
169
171
  .action(async (opts) => {
170
172
  const configs = await loadPromptConfigs(opts.config, opts.id);
171
173
  if (configs.length === 0) {
@@ -174,6 +176,54 @@ program
174
176
  process.exitCode = 1;
175
177
  return;
176
178
  }
179
+ // A/B testing mode
180
+ if (opts.ab) {
181
+ const [idA, idB] = String(opts.ab).split(':');
182
+ if (!idA || !idB) {
183
+ console.log(chalk_1.default.red('Invalid --ab format. Use: --ab <variant-a-id>:<variant-b-id>'));
184
+ process.exitCode = 1;
185
+ return;
186
+ }
187
+ const variantA = configs.find(c => c.id === idA);
188
+ const variantB = configs.find(c => c.id === idB);
189
+ if (!variantA) {
190
+ console.log(chalk_1.default.red(`Variant A not found: "${idA}"`));
191
+ process.exitCode = 1;
192
+ return;
193
+ }
194
+ if (!variantB) {
195
+ console.log(chalk_1.default.red(`Variant B not found: "${idB}"`));
196
+ process.exitCode = 1;
197
+ return;
198
+ }
199
+ const projectConfigAB = loadProjectConfig(opts.config);
200
+ const cacheDirAB = projectConfigAB?.snapshotDir
201
+ ? path.join(path.dirname(projectConfigAB.snapshotDir), 'cache')
202
+ : '.promptlock/cache';
203
+ const abRunOpts = {
204
+ verbose: opts.verbose,
205
+ cache: opts.cache ?? false,
206
+ cacheDir: cacheDirAB,
207
+ retry: { maxRetries: 3 },
208
+ };
209
+ const spinAB = (0, utils_1.spinner)(`Running A/B: ${idA} vs ${idB}...`);
210
+ const abResult = await (0, runner_1.runAB)(variantA, variantB, abRunOpts);
211
+ spinAB.stop(`Completed A/B comparison.`);
212
+ (0, reporter_1.printABReport)(abResult);
213
+ // Write markdown report if requested
214
+ const abReportDir = projectConfigAB?.reportDir ?? '.promptlock/reports';
215
+ const abFormats = parseReportFormats(opts.report, projectConfigAB);
216
+ if (abFormats.includes('markdown')) {
217
+ const p = await (0, reporter_1.generateABMarkdownReport)(abResult, abReportDir);
218
+ console.log(chalk_1.default.dim(`Report saved: ${p}`));
219
+ }
220
+ // Exit code for CI
221
+ const abFailed = !abResult.variantA.passed || !abResult.variantB.passed;
222
+ if (abFailed && (opts.ci || projectConfigAB?.ci?.failOnRegression)) {
223
+ process.exitCode = 1;
224
+ }
225
+ return;
226
+ }
177
227
  if (opts.dryRun) {
178
228
  console.log(chalk_1.default.dim(`[dry-run] Would test ${configs.length} prompt${configs.length !== 1 ? 's' : ''}:`));
179
229
  for (const c of configs) {
@@ -221,6 +271,10 @@ program
221
271
  const p = await (0, reporter_1.generateHtmlReport)(results, reportDir);
222
272
  console.log(chalk_1.default.dim(`Report saved: ${p}`));
223
273
  }
274
+ if (format === 'markdown') {
275
+ const p = await (0, reporter_1.generateMarkdownReport)(results, reportDir);
276
+ console.log(chalk_1.default.dim(`Report saved: ${p}`));
277
+ }
224
278
  }
225
279
  // Post GitHub PR comment
226
280
  if (opts.githubPr) {
@@ -256,6 +310,44 @@ program
256
310
  if (anyFailed && (opts.ci || projectConfig?.ci?.failOnRegression)) {
257
311
  process.exitCode = 1;
258
312
  }
313
+ // Watch mode
314
+ if (opts.watch) {
315
+ console.log('');
316
+ console.log(chalk_1.default.dim('Watching for changes... (press Ctrl+C to stop)'));
317
+ const promptsDir = path.join(process.cwd(), 'prompts');
318
+ const watchTargets = [promptsDir];
319
+ const discovered = await (0, config_loader_1.discoverConfigFile)(process.cwd());
320
+ if (discovered)
321
+ watchTargets.push(discovered);
322
+ let debounceTimer = null;
323
+ for (const target of watchTargets) {
324
+ try {
325
+ fs.watch(target, { recursive: true }, () => {
326
+ if (debounceTimer)
327
+ clearTimeout(debounceTimer);
328
+ debounceTimer = setTimeout(async () => {
329
+ console.clear();
330
+ console.log(chalk_1.default.dim('Re-running...'));
331
+ try {
332
+ const reConfigs = await loadPromptConfigs(opts.config, opts.id);
333
+ const reResults = await (0, runner_1.runAll)(reConfigs, runOpts);
334
+ (0, reporter_1.printConsoleReport)(reResults);
335
+ }
336
+ catch (e) {
337
+ console.error(chalk_1.default.red(`Error: ${e.message}`));
338
+ }
339
+ console.log('');
340
+ console.log(chalk_1.default.dim('Watching for changes... (press Ctrl+C to stop)'));
341
+ }, 500);
342
+ });
343
+ }
344
+ catch {
345
+ // Target doesn't exist, skip
346
+ }
347
+ }
348
+ // Keep process alive
349
+ await new Promise(() => { });
350
+ }
259
351
  });
260
352
  // ── snapshot ──────────────────────────────────────────────────────────────────
261
353
  program
@@ -379,27 +471,38 @@ function loadProjectConfig(configPath) {
379
471
  }
380
472
  }
381
473
  async function loadPromptConfigs(configPath, filterId) {
474
+ const configs = [];
475
+ // Try YAML/JSON config file first (auto-discovery)
476
+ const discovered = await (0, config_loader_1.discoverConfigFile)(process.cwd());
477
+ if (discovered) {
478
+ try {
479
+ const loaded = await (0, config_loader_1.loadConfigFile)(discovered);
480
+ const items = Array.isArray(loaded) ? loaded : [loaded];
481
+ configs.push(...items);
482
+ }
483
+ catch (e) {
484
+ console.error(chalk_1.default.red(`Error loading ${discovered}: ${e.message}`));
485
+ }
486
+ }
487
+ // Also scan prompts/ directory for JS/JSON/YAML files
382
488
  const projectConfig = loadProjectConfig(configPath);
383
489
  const promptsDir = projectConfig?.promptsDir
384
490
  ? path.resolve(process.cwd(), projectConfig.promptsDir)
385
491
  : path.join(process.cwd(), 'prompts');
386
- const configs = [];
387
492
  try {
388
493
  const files = fs.readdirSync(promptsDir);
389
494
  for (const file of files) {
390
- if (!file.endsWith('.js') && !file.endsWith('.json'))
495
+ if (!['.js', '.json', '.yaml', '.yml'].some(ext => file.endsWith(ext)))
391
496
  continue;
392
497
  const filePath = path.join(promptsDir, file);
393
498
  try {
394
- const mod = require(filePath);
395
- const config = mod.default ?? mod;
396
- const items = Array.isArray(config) ? config : (config && config.id) ? [config] : [];
499
+ const loaded = await (0, config_loader_1.loadConfigFile)(filePath);
500
+ const items = Array.isArray(loaded) ? loaded : [loaded];
397
501
  for (const item of items) {
398
- const validation = (0, config_validation_1.validateConfig)(item);
399
- if (!validation.valid) {
400
- console.warn(chalk_1.default.yellow(`⚠️ ${file}: ${validation.errors.join('; ')}`));
502
+ // Avoid duplicates from auto-discovery
503
+ if (!configs.some(c => c.id === item.id)) {
504
+ configs.push(item);
401
505
  }
402
- configs.push(item);
403
506
  }
404
507
  }
405
508
  catch (e) {