promptlab-cli 0.1.0__py3-none-any.whl

promptlab/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """promptlab: Automated testing for LLM prompts."""
+
+ __version__ = "0.1.0"
promptlab/assertions.py ADDED
@@ -0,0 +1,182 @@
+ """Assertion types for evaluating LLM outputs."""
+
+ import re
+
+ from promptlab.providers import call_llm
+
+
+ def check_assertion(assertion: dict, output: str, model: str) -> dict:
+     """Check a single assertion against an LLM output.
+
+     Returns:
+         dict with keys: passed (bool), type, expected, got, message
+     """
+     atype = assertion["type"]
+     value = assertion.get("value")
+
+     checkers = {
+         "contains": _check_contains,
+         "not_contains": _check_not_contains,
+         "starts_with": _check_starts_with,
+         "regex": _check_regex,
+         "equals": _check_equals,
+         "max_tokens": _check_max_tokens,
+         "min_length": _check_min_length,
+         "max_length": _check_max_length,
+         "llm_judge": _check_llm_judge,
+     }
+
+     checker = checkers.get(atype)
+     if checker is None:
+         return {
+             "passed": False,
+             "type": atype,
+             "expected": value,
+             "got": None,
+             "message": f"Unknown assertion type: {atype}",
+         }
+
+     if atype == "llm_judge":
+         return checker(output, value, model)
+     return checker(output, value)
+
+
+ def _check_contains(output: str, value: str) -> dict:
+     passed = value.lower() in output.lower()
+     return {
+         "passed": passed,
+         "type": "contains",
+         "expected": f'contains "{value}"',
+         "got": output[:200] + "..." if len(output) > 200 else output,
+         "message": "" if passed else f'Output does not contain "{value}"',
+     }
+
+
+ def _check_not_contains(output: str, value: str) -> dict:
+     passed = value.lower() not in output.lower()
+     return {
+         "passed": passed,
+         "type": "not_contains",
+         "expected": f'does not contain "{value}"',
+         "got": output[:200] + "..." if len(output) > 200 else output,
+         "message": "" if passed else f'Output contains "{value}" (should not)',
+     }
+
+
+ def _check_starts_with(output: str, value: str) -> dict:
+     passed = output.strip().lower().startswith(value.lower())
+     return {
+         "passed": passed,
+         "type": "starts_with",
+         "expected": f'starts with "{value}"',
+         "got": output[:100],
+         "message": "" if passed else f'Output does not start with "{value}"',
+     }
+
+
+ def _check_regex(output: str, value: str) -> dict:
+     try:
+         passed = bool(re.search(value, output))
+     except re.error as e:
+         return {
+             "passed": False,
+             "type": "regex",
+             "expected": f"matches /{value}/",
+             "got": None,
+             "message": f"Invalid regex: {e}",
+         }
+     return {
+         "passed": passed,
+         "type": "regex",
+         "expected": f"matches /{value}/",
+         "got": output[:200] + "..." if len(output) > 200 else output,
+         "message": "" if passed else f"Output does not match regex /{value}/",
+     }
+
+
+ def _check_equals(output: str, value: str) -> dict:
+     passed = output.strip() == value.strip()
+     return {
+         "passed": passed,
+         "type": "equals",
+         "expected": value,
+         "got": output.strip(),
+         "message": "" if passed else "Output does not exactly match expected value",
+     }
+
+
+ def _check_max_tokens(output: str, value: int) -> dict:
+     # Rough token estimate: one token per whitespace-separated word
+     token_estimate = len(output.split())
+     passed = token_estimate <= value
+     return {
+         "passed": passed,
+         "type": "max_tokens",
+         "expected": f"<= {value} tokens",
+         "got": f"~{token_estimate} tokens ({len(output)} chars)",
+         "message": "" if passed else f"Output has ~{token_estimate} tokens, max is {value}",
+     }
+
+
+ def _check_min_length(output: str, value: int) -> dict:
+     length = len(output.strip())
+     passed = length >= value
+     return {
+         "passed": passed,
+         "type": "min_length",
+         "expected": f">= {value} chars",
+         "got": f"{length} chars",
+         "message": "" if passed else f"Output is {length} chars, minimum is {value}",
+     }
+
+
+ def _check_max_length(output: str, value: int) -> dict:
+     length = len(output.strip())
+     passed = length <= value
+     return {
+         "passed": passed,
+         "type": "max_length",
+         "expected": f"<= {value} chars",
+         "got": f"{length} chars",
+         "message": "" if passed else f"Output is {length} chars, maximum is {value}",
+     }
+
+
+ def _check_llm_judge(output: str, criteria: str, model: str) -> dict:
+     """Use an LLM to judge the output quality."""
+     judge_prompt = f"""You are evaluating an LLM output. Answer only YES or NO.
+
+ Criteria: {criteria}
+
+ Output to evaluate:
+ ---
+ {output}
+ ---
+
+ Does the output meet the criteria? Answer only YES or NO."""
+
+     try:
+         response = call_llm(
+             prompt=judge_prompt,
+             model=model,
+             temperature=0,
+             max_tokens=10,
+         )
+         answer = response["text"].strip().upper()
+         passed = answer.startswith("YES")
+     except Exception as e:
+         return {
+             "passed": False,
+             "type": "llm_judge",
+             "expected": criteria,
+             "got": f"Judge error: {e}",
+             "message": f"LLM judge failed: {e}",
+         }
+
+     return {
+         "passed": passed,
+         "type": "llm_judge",
+         "expected": criteria,
+         "got": f"Judge answered: {answer}",
+         "message": "" if passed else f"LLM judge said NO to: {criteria}",
+     }
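
For orientation, a minimal sketch of calling `check_assertion` from the module above directly; the literal strings are illustrative, and only `llm_judge` assertions trigger an API call through `call_llm`, so this snippet needs no key:

```python
from promptlab.assertions import check_assertion

# Deterministic assertion types run entirely locally.
result = check_assertion(
    {"type": "contains", "value": "Paris"},
    "The capital of France is Paris.",
    model="claude-sonnet-4-20250514",  # only consulted for llm_judge assertions
)
print(result["passed"])    # True
print(result["expected"])  # contains "Paris"
```
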
promptlab/cli.py ADDED
@@ -0,0 +1,81 @@
+ """CLI entry point for promptlab."""
+
+ import json
+ import sys
+ import time
+ from pathlib import Path
+
+ import click
+
+ from promptlab.loader import load_test_files
+ from promptlab.runner import run_all_tests
+ from promptlab.reporter import print_results, print_summary
+
+
+ @click.group()
+ @click.version_option(version="0.1.0")
+ def main():
+     """promptlab — Automated testing for LLM prompts."""
+     pass
+
+
+ @main.command()
+ @click.argument("path", type=click.Path(exists=True))
+ @click.option("--verbose", "-v", is_flag=True, help="Show full LLM responses")
+ @click.option("--json-output", "--json", "json_out", is_flag=True, help="Output results as JSON")
+ @click.option("--dry-run", is_flag=True, help="Show tests without calling APIs")
+ def run(path: str, verbose: bool, json_out: bool, dry_run: bool):
+     """Run prompt tests from a file or directory."""
+     target = Path(path)
+
+     # Collect test files
+     if target.is_file():
+         files = [target]
+     elif target.is_dir():
+         files = sorted(target.glob("**/*.yaml")) + sorted(target.glob("**/*.yml"))
+     else:
+         click.echo(f"Error: {path} is not a file or directory", err=True)
+         sys.exit(1)
+
+     if not files:
+         click.echo(f"No .yaml or .yml files found in {path}", err=True)
+         sys.exit(1)
+
+     # Load test suites
+     suites = load_test_files(files)
+
+     if dry_run:
+         for suite in suites:
+             click.echo(f"\n📄 {suite['file']}")
+             click.echo(f" Model: {suite['model']}")
+             click.echo(f" Tests: {len(suite['tests'])}")
+             for test in suite["tests"]:
+                 assertions = ", ".join(a["type"] for a in test["assert"])
+                 click.echo(f" - {test['name']} [{assertions}]")
+         return
+
+     # Run tests
+     start = time.time()
+     results = run_all_tests(suites, verbose=verbose)
+     elapsed = time.time() - start
+
+     if json_out:
+         output = {
+             "results": results,
+             "elapsed_seconds": round(elapsed, 2),
+             "total": len(results),
+             "passed": sum(1 for r in results if r["passed"]),
+             "failed": sum(1 for r in results if not r["passed"]),
+         }
+         click.echo(json.dumps(output, indent=2))
+     else:
+         print_results(results, verbose=verbose)
+         print_summary(results, elapsed)
+
+     # Exit with non-zero if any test failed
+     if any(not r["passed"] for r in results):
+         sys.exit(1)
+
+
+ if __name__ == "__main__":
+     main()
promptlab/loader.py ADDED
@@ -0,0 +1,89 @@
+ """Load and validate YAML test files."""
+
+ import re
+ from pathlib import Path
+ from typing import Any
+
+ import yaml
+
+
+ def load_test_files(files: list[Path]) -> list[dict]:
+     """Load and validate test suites from YAML files."""
+     suites = []
+     for file in files:
+         suite = _load_single_file(file)
+         suites.append(suite)
+     return suites
+
+
+ def _load_single_file(file: Path) -> dict:
+     """Load a single YAML test file."""
+     with open(file) as f:
+         raw = yaml.safe_load(f)
+
+     if not isinstance(raw, dict):
+         raise ValueError(f"{file}: Expected a YAML mapping at the top level")
+
+     # Required fields
+     if "prompt" not in raw:
+         raise ValueError(f"{file}: Missing required 'prompt' field")
+     if "tests" not in raw:
+         raise ValueError(f"{file}: Missing required 'tests' field")
+
+     prompt_template = raw["prompt"]
+     model = raw.get("model", "claude-sonnet-4-20250514")
+     system = raw.get("system", None)
+     temperature = raw.get("temperature", 0)
+     max_tokens = raw.get("max_tokens", 1024)
+
+     tests = []
+     for i, test_raw in enumerate(raw["tests"]):
+         test = _validate_test(test_raw, file, i)
+         tests.append(test)
+
+     return {
+         "file": str(file),
+         "prompt_template": prompt_template,
+         "model": model,
+         "system": system,
+         "temperature": temperature,
+         "max_tokens": max_tokens,
+         "tests": tests,
+     }
+
+
+ def _validate_test(test: dict, file: Path, index: int) -> dict:
+     """Validate a single test case."""
+     if not isinstance(test, dict):
+         raise ValueError(f"{file}: Test {index} must be a mapping")
+
+     name = test.get("name", f"test_{index}")
+     variables = test.get("vars", {})
+     assertions = test.get("assert", [])
+
+     if not assertions:
+         raise ValueError(f"{file}: Test '{name}' has no assertions")
+
+     validated_assertions = []
+     for a in assertions:
+         if "type" not in a:
+             raise ValueError(f"{file}: Test '{name}' has assertion without 'type'")
+         validated_assertions.append({
+             "type": a["type"],
+             "value": a.get("value"),
+         })
+
+     return {
+         "name": name,
+         "vars": variables,
+         "assert": validated_assertions,
+     }
+
+
+ def render_prompt(template: str, variables: dict) -> str:
+     """Render a prompt template with variables using {{ var }} syntax."""
+     result = template
+     for key, value in variables.items():
+         result = result.replace("{{ " + key + " }}", str(value))
+         result = result.replace("{{" + key + "}}", str(value))
+     return result
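
A small sketch of what `render_prompt` above does with variables: substitution is plain string replacement of `{{ key }}` / `{{key}}`, not a template engine, so other spacing inside the braces is left untouched. Example values are illustrative:

```python
from promptlab.loader import render_prompt

template = "Summarize this article in 2-3 sentences:\n{{ article }}"
print(render_prompt(template, {"article": "The Fed held rates steady."}))
# Summarize this article in 2-3 sentences:
# The Fed held rates steady.
```
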
promptlab/providers.py ADDED
@@ -0,0 +1,137 @@
+ """LLM API providers: Claude and OpenAI."""
+
+ import os
+
+ import httpx
+
+
+ ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages"
+ OPENAI_API_URL = "https://api.openai.com/v1/chat/completions"
+
+
+ def _is_anthropic_model(model: str) -> bool:
+     return model.startswith("claude")
+
+
+ def _is_openai_model(model: str) -> bool:
+     return model.startswith("gpt") or model.startswith("o1") or model.startswith("o3")
+
+
+ def call_llm(
+     prompt: str,
+     model: str,
+     system: str | None = None,
+     temperature: float = 0,
+     max_tokens: int = 1024,
+ ) -> dict:
+     """Call an LLM and return the response.
+
+     Returns:
+         dict with keys: text, model, input_tokens, output_tokens
+     """
+     if _is_anthropic_model(model):
+         return _call_anthropic(prompt, model, system, temperature, max_tokens)
+     elif _is_openai_model(model):
+         return _call_openai(prompt, model, system, temperature, max_tokens)
+     else:
+         raise ValueError(
+             f"Unknown model: {model}. "
+             "Supported prefixes: 'claude' (Anthropic), 'gpt'/'o1'/'o3' (OpenAI)"
+         )
+
+
+ def _call_anthropic(
+     prompt: str,
+     model: str,
+     system: str | None,
+     temperature: float,
+     max_tokens: int,
+ ) -> dict:
+     api_key = os.environ.get("ANTHROPIC_API_KEY")
+     if not api_key:
+         raise ValueError(
+             "ANTHROPIC_API_KEY environment variable is not set. "
+             "Get a key at https://console.anthropic.com/"
+         )
+
+     headers = {
+         "x-api-key": api_key,
+         "content-type": "application/json",
+         "anthropic-version": "2023-06-01",
+     }
+
+     body: dict = {
+         "model": model,
+         "max_tokens": max_tokens,
+         "temperature": temperature,
+         "messages": [{"role": "user", "content": prompt}],
+     }
+     if system:
+         body["system"] = system
+
+     with httpx.Client(timeout=60) as client:
+         resp = client.post(ANTHROPIC_API_URL, headers=headers, json=body)
+
+     if resp.status_code != 200:
+         raise RuntimeError(f"Anthropic API error ({resp.status_code}): {resp.text}")
+
+     data = resp.json()
+     text = ""
+     for block in data.get("content", []):
+         if block.get("type") == "text":
+             text += block.get("text", "")
+
+     return {
+         "text": text,
+         "model": data.get("model", model),
+         "input_tokens": data.get("usage", {}).get("input_tokens", 0),
+         "output_tokens": data.get("usage", {}).get("output_tokens", 0),
+     }
+
+
+ def _call_openai(
+     prompt: str,
+     model: str,
+     system: str | None,
+     temperature: float,
+     max_tokens: int,
+ ) -> dict:
+     api_key = os.environ.get("OPENAI_API_KEY")
+     if not api_key:
+         raise ValueError(
+             "OPENAI_API_KEY environment variable is not set. "
+             "Get a key at https://platform.openai.com/"
+         )
+
+     headers = {
+         "Authorization": f"Bearer {api_key}",
+         "Content-Type": "application/json",
+     }
+
+     messages = []
+     if system:
+         messages.append({"role": "system", "content": system})
+     messages.append({"role": "user", "content": prompt})
+
+     body = {
+         "model": model,
+         "messages": messages,
+         "temperature": temperature,
+         "max_tokens": max_tokens,
+     }
+
+     with httpx.Client(timeout=60) as client:
+         resp = client.post(OPENAI_API_URL, headers=headers, json=body)
+
+     if resp.status_code != 200:
+         raise RuntimeError(f"OpenAI API error ({resp.status_code}): {resp.text}")
+
+     data = resp.json()
+     text = data["choices"][0]["message"]["content"]
+
+     return {
+         "text": text,
+         "model": data.get("model", model),
+         "input_tokens": data.get("usage", {}).get("prompt_tokens", 0),
+         "output_tokens": data.get("usage", {}).get("completion_tokens", 0),
+     }
promptlab/reporter.py ADDED
@@ -0,0 +1,69 @@
+ """Reporter: pretty terminal output for test results."""
+
+ from rich.console import Console
+ from rich.table import Table
+
+ console = Console()
+
+
+ def print_results(results: list[dict], verbose: bool = False):
+     """Print individual test results."""
+     console.print()
+
+     for result in results:
+         status = "[bold green]✅ PASS[/]" if result["passed"] else "[bold red]❌ FAIL[/]"
+         name = f"{result['suite']} :: {result['name']}"
+         elapsed = f"({result['elapsed']}s)"
+
+         console.print(f" {status} {name} [dim]{elapsed}[/]")
+
+         if result["error"]:
+             console.print(f" [red]Error: {result['error']}[/]")
+
+         if not result["passed"]:
+             for assertion in result["assertions"]:
+                 if not assertion["passed"]:
+                     console.print(f" [dim]Expected:[/] {assertion['expected']}")
+                     if assertion.get("message"):
+                         console.print(f" [dim]Reason:[/] {assertion['message']}")
+
+         if verbose and result["output"]:
+             console.print(f" [dim]Output:[/]")
+             # Indent and truncate output
+             output_lines = result["output"][:500].split("\n")
+             for line in output_lines:
+                 console.print(f" [dim]{line}[/]")
+             if len(result["output"]) > 500:
+                 console.print(f" [dim]... (truncated)[/]")
+     console.print()
+
+
+ def print_summary(results: list[dict], elapsed: float):
+     """Print summary line."""
+     total = len(results)
+     passed = sum(1 for r in results if r["passed"])
+     failed = total - passed
+
+     total_input = sum(r.get("input_tokens", 0) for r in results)
+     total_output = sum(r.get("output_tokens", 0) for r in results)
+
+     console.print()
+     console.print("━" * 50)
+
+     if failed == 0:
+         console.print(
+             f" [bold green]Results: {passed} passed[/], "
+             f"{total} total ({elapsed:.1f}s)"
+         )
+     else:
+         console.print(
+             f" [bold red]Results: {passed} passed, {failed} failed[/], "
+             f"{total} total ({elapsed:.1f}s)"
+         )
+
+     if total_input > 0 or total_output > 0:
+         console.print(
+             f" [dim]Tokens: {total_input:,} input, {total_output:,} output[/]"
+         )
+
+     console.print()
promptlab/runner.py ADDED
@@ -0,0 +1,85 @@
+ """Test runner: executes test suites and collects results."""
+
+ import time
+
+ from promptlab.assertions import check_assertion
+ from promptlab.loader import render_prompt
+ from promptlab.providers import call_llm
+
+
+ def run_all_tests(suites: list[dict], verbose: bool = False) -> list[dict]:
+     """Run all test suites and return results."""
+     results = []
+
+     for suite in suites:
+         suite_results = _run_suite(suite, verbose=verbose)
+         results.extend(suite_results)
+
+     return results
+
+
+ def _run_suite(suite: dict, verbose: bool = False) -> list[dict]:
+     """Run a single test suite."""
+     results = []
+     file_name = suite["file"]
+     # Use the filename without extension as the suite name
+     suite_name = file_name.rsplit("/", 1)[-1].rsplit(".", 1)[0]
+
+     for test in suite["tests"]:
+         result = _run_single_test(suite, test, suite_name, verbose=verbose)
+         results.append(result)
+
+     return results
+
+
+ def _run_single_test(suite: dict, test: dict, suite_name: str, verbose: bool = False) -> dict:
+     """Run a single test case."""
+     test_name = test["name"]
+     start = time.time()
+
+     # Render the prompt with variables
+     prompt = render_prompt(suite["prompt_template"], test["vars"])
+
+     # Call the LLM
+     try:
+         response = call_llm(
+             prompt=prompt,
+             model=suite["model"],
+             system=suite.get("system"),
+             temperature=suite.get("temperature", 0),
+             max_tokens=suite.get("max_tokens", 1024),
+         )
+         output = response["text"]
+         error = None
+     except Exception as e:
+         output = ""
+         error = str(e)
+         response = {"input_tokens": 0, "output_tokens": 0, "model": suite["model"]}
+
+     elapsed = time.time() - start
+
+     # Check assertions
+     assertion_results = []
+     all_passed = error is None
+
+     if error is None:
+         for assertion in test["assert"]:
+             result = check_assertion(assertion, output, suite["model"])
+             assertion_results.append(result)
+             if not result["passed"]:
+                 all_passed = False
+     else:
+         all_passed = False
+
+     return {
+         "suite": suite_name,
+         "name": test_name,
+         "passed": all_passed,
+         "elapsed": round(elapsed, 2),
+         "output": output,
+         "error": error,
+         "assertions": assertion_results,
+         "model": response.get("model", suite["model"]),
+         "input_tokens": response.get("input_tokens", 0),
+         "output_tokens": response.get("output_tokens", 0),
+     }
promptlab_cli-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,235 @@
+ Metadata-Version: 2.4
+ Name: promptlab-cli
+ Version: 0.1.0
+ Summary: Automated testing for LLM prompts. Like pytest, but for prompts.
+ Project-URL: Homepage, https://github.com/vigp17/promptlab
+ Project-URL: Repository, https://github.com/vigp17/promptlab
+ Project-URL: Issues, https://github.com/vigp17/promptlab/issues
+ Author: Vignesh Pai
+ License-Expression: MIT
+ Keywords: ai,claude,evaluation,llm,openai,prompts,testing
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Topic :: Software Development :: Testing
+ Requires-Python: >=3.10
+ Requires-Dist: click>=8.0
+ Requires-Dist: httpx>=0.24.0
+ Requires-Dist: pyyaml>=6.0
+ Requires-Dist: rich>=13.0
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7.0; extra == 'dev'
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
+ Description-Content-Type: text/markdown
+
+ # promptlab ⚡
+
+ Automated testing for LLM prompts. Write test cases in YAML, run them against Claude or OpenAI, get pass/fail results in your terminal.
+
+ **Like pytest, but for prompts.**
+
+ ```bash
+ pip install promptlab-cli
+ promptlab run tests/
+ ```
+
+ ```
+ ✅ summarize_article :: returns_short_summary PASS (1.2s)
+ ✅ summarize_article :: mentions_key_points PASS (1.1s)
+ ❌ translate_text :: preserves_tone FAIL (0.9s)
+    Expected: contains "formal"
+    Got: "Here is the translated text in a casual style..."
+
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+ Results: 2 passed, 1 failed, 3 total (3.2s)
+ ```
+
+ ## Why?
+
+ You're building an app with Claude or GPT. Your prompt works today. Tomorrow you tweak it and something breaks. You don't notice until a user complains.
+
+ **promptlab catches prompt regressions before they ship.** Define what good output looks like, run tests on every change, and know immediately if something broke.
+
+ ## Quickstart
+
+ ### Install
+
+ ```bash
+ pip install promptlab-cli
+ ```
+
+ ### Set your API key
+
+ ```bash
+ export ANTHROPIC_API_KEY=sk-ant-...
+ # or
+ export OPENAI_API_KEY=sk-...
+ ```
+
+ ### Write a test file
+
+ Create `tests/summarize.yaml`:
+
+ ```yaml
+ prompt: |
+   Summarize this article in 2-3 sentences:
+   {{ article }}
+
+ model: claude-sonnet-4-20250514
+
+ tests:
+   - name: short_summary
+     vars:
+       article: |
+         The Federal Reserve held interest rates steady on Wednesday,
+         keeping the benchmark rate in the 5.25%-5.50% range. Chair
+         Jerome Powell said the committee needs more confidence that
+         inflation is moving toward the 2% target before cutting rates.
+     assert:
+       - type: max_tokens
+         value: 100
+       - type: contains
+         value: "Federal Reserve"
+       - type: contains
+         value: "interest rate"
+
+   - name: handles_empty_input
+     vars:
+       article: ""
+     assert:
+       - type: not_contains
+         value: "error"
+       - type: min_length
+         value: 10
+ ```
+
+ ### Run it
+
+ ```bash
+ promptlab run tests/
+ ```
+
+ ## Test File Format
+
+ Each `.yaml` file defines a prompt and its test cases:
+
+ ```yaml
+ # The prompt template. Use {{ variable }} for inputs.
+ prompt: |
+   You are a helpful assistant. {{ instruction }}
+
+ # Which model to use
+ model: claude-sonnet-4-20250514 # or gpt-4o, claude-haiku-4-5-20251001, etc.
+
+ # Optional system prompt
+ system: "You are a concise technical writer."
+
+ # Optional model parameters
+ temperature: 0
+ max_tokens: 500
+
+ # Test cases
+ tests:
+   - name: test_name
+     vars:
+       instruction: "Explain what a CPU does in one sentence."
+     assert:
+       - type: contains
+         value: "processor"
+       - type: max_tokens
+         value: 50
+ ```
+
+ ## Assertion Types
+
+ | Type | Description | Example |
+ |---|---|---|
+ | `contains` | Output must contain this string (case-insensitive) | `value: "machine learning"` |
+ | `not_contains` | Output must NOT contain this string | `value: "I'm sorry"` |
+ | `starts_with` | Output must start with this string | `value: "Sure"` |
+ | `regex` | Output must match this regex pattern | `value: "\\d{4}"` |
+ | `max_tokens` | Output must be at most N tokens | `value: 100` |
+ | `min_length` | Output must be at least N characters | `value: 50` |
+ | `max_length` | Output must be at most N characters | `value: 500` |
+ | `equals` | Output must exactly equal this string | `value: "42"` |
+ | `llm_judge` | Ask another LLM to evaluate the output | `value: "Is this response helpful and accurate?"` |
+
+ ## LLM-as-Judge
+
+ The most powerful assertion type. Uses a second LLM call to evaluate output quality:
+
+ ```yaml
+ tests:
+   - name: helpful_response
+     vars:
+       question: "How do I fix a memory leak in Python?"
+     assert:
+       - type: llm_judge
+         value: "Does this response provide specific, actionable debugging steps? Answer YES or NO."
+ ```
+
+ ## Supported Models
+
+ **Anthropic (Claude):**
+ - `claude-sonnet-4-20250514`
+ - `claude-haiku-4-5-20251001`
+ - Set `ANTHROPIC_API_KEY` environment variable
+
+ **OpenAI:**
+ - `gpt-4o`
+ - `gpt-4o-mini`
+ - Set `OPENAI_API_KEY` environment variable
+
+ ## CLI Commands
+
+ ```bash
+ # Run all test files in a directory
+ promptlab run tests/
+
+ # Run a single test file
+ promptlab run tests/summarize.yaml
+
+ # Verbose output (show full LLM responses)
+ promptlab run tests/ --verbose
+
+ # Output results as JSON
+ promptlab run tests/ --json
+
+ # Dry run (show what would be tested without calling APIs)
+ promptlab run tests/ --dry-run
+ ```
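+
+ With `--json`, the run is reported as a single JSON object instead of the table view. A sketch of the shape, based on the fields the CLI and runner assemble; the values here are illustrative:
+
+ ```json
+ {
+   "results": [
+     {
+       "suite": "summarize",
+       "name": "short_summary",
+       "passed": true,
+       "elapsed": 1.21,
+       "output": "The Federal Reserve held rates steady...",
+       "error": null,
+       "assertions": [
+         {"passed": true, "type": "contains", "expected": "contains \"Federal Reserve\"", "got": "...", "message": ""}
+       ],
+       "model": "claude-sonnet-4-20250514",
+       "input_tokens": 182,
+       "output_tokens": 64
+     }
+   ],
+   "elapsed_seconds": 1.24,
+   "total": 1,
+   "passed": 1,
+   "failed": 0
+ }
+ ```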
+
+ ## Use Cases
+
+ - **Prompt regression testing** — Run tests in CI/CD to catch regressions (see the workflow sketch below)
+ - **Prompt comparison** — Test the same cases across different models
+ - **Guard rails validation** — Verify your prompt rejects harmful inputs
+ - **Output format checking** — Ensure structured output matches expectations
+
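+ promptlab has no dedicated GitHub Actions integration yet (see Roadmap), but because the CLI exits non-zero when any test fails, a plain workflow step is enough to gate merges. A minimal sketch, with illustrative file path and secret name:
+
+ ```yaml
+ # .github/workflows/prompt-tests.yml
+ name: prompt-tests
+ on: [pull_request]
+ jobs:
+   test:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v4
+       - uses: actions/setup-python@v5
+         with:
+           python-version: "3.11"
+       - run: pip install promptlab-cli
+       - run: promptlab run tests/
+         env:
+           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+ ```
+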
+ ## Development
+
+ ```bash
+ git clone https://github.com/vigp17/promptlab.git
+ cd promptlab
+ pip install -e ".[dev]"
+ pytest
+ ```
+
+ ## Roadmap
+
+ - [x] YAML test definitions
+ - [x] Claude and OpenAI support
+ - [x] 9 assertion types including LLM-as-judge
+ - [x] CLI with colored output
+ - [ ] Cost tracking per test run
+ - [ ] HTML report generation
+ - [ ] Parallel test execution
+ - [ ] GitHub Actions integration
+ - [ ] Prompt versioning and diff
+ - [ ] Custom scoring functions
+
+ ## License
+
+ MIT
promptlab_cli-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,11 @@
+ promptlab/__init__.py,sha256=os5p2uKFDJzFm1RuGX_yAwXOc6Jktvbtd79J9voVwPE,75
+ promptlab/assertions.py,sha256=AD9CWAhktSJkh_4xtdKUipm3_PrTbmldxJ_iABsYWZw,5348
+ promptlab/cli.py,sha256=_c00XcyyzmKs-5t-c6XwB5MJKn13gCe6kjgnuR2ricA,2452
+ promptlab/loader.py,sha256=sg0aU0HA5OSPpl-cGWvUKCgPBwM5kfsK1xJltDCRmWQ,2579
+ promptlab/providers.py,sha256=C6aRNyieptpGNa4wylUtsJgGxBnwE5Fngnk_-WwZOfQ,3743
+ promptlab/reporter.py,sha256=QXjhWJpX5CtzH9tKAcAoI716bMfV998OQQP5lvm8e0M,2329
+ promptlab/runner.py,sha256=wogLZ4K2hyURl6vCG-SS3czfIMElWZmgePpYKeXtUMs,2520
+ promptlab_cli-0.1.0.dist-info/METADATA,sha256=9FnTyN5w9e_-AESCS2-ogLzn_WwH72dn797lIYGlRoU,6303
+ promptlab_cli-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
+ promptlab_cli-0.1.0.dist-info/entry_points.txt,sha256=-_chAvLXxNGzv9TkKKVTmZLMe9rBMgu7qPXxuU4Io1Y,49
+ promptlab_cli-0.1.0.dist-info/RECORD,,
promptlab_cli-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.29.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any
promptlab_cli-0.1.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ promptlab = promptlab.cli:main