promptlab-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promptlab/__init__.py +3 -0
- promptlab/assertions.py +182 -0
- promptlab/cli.py +81 -0
- promptlab/loader.py +89 -0
- promptlab/providers.py +137 -0
- promptlab/reporter.py +69 -0
- promptlab/runner.py +85 -0
- promptlab_cli-0.1.0.dist-info/METADATA +235 -0
- promptlab_cli-0.1.0.dist-info/RECORD +11 -0
- promptlab_cli-0.1.0.dist-info/WHEEL +4 -0
- promptlab_cli-0.1.0.dist-info/entry_points.txt +2 -0
promptlab/__init__.py
ADDED
promptlab/assertions.py
ADDED

```python
"""Assertion types for evaluating LLM outputs."""

import re

from promptlab.providers import call_llm


def check_assertion(assertion: dict, output: str, model: str) -> dict:
    """Check a single assertion against an LLM output.

    Returns:
        dict with keys: passed (bool), type, expected, got, message
    """
    atype = assertion["type"]
    value = assertion.get("value")

    checkers = {
        "contains": _check_contains,
        "not_contains": _check_not_contains,
        "starts_with": _check_starts_with,
        "regex": _check_regex,
        "equals": _check_equals,
        "max_tokens": _check_max_tokens,
        "min_length": _check_min_length,
        "max_length": _check_max_length,
        "llm_judge": _check_llm_judge,
    }

    checker = checkers.get(atype)
    if checker is None:
        return {
            "passed": False,
            "type": atype,
            "expected": value,
            "got": None,
            "message": f"Unknown assertion type: {atype}",
        }

    if atype == "llm_judge":
        return checker(output, value, model)
    return checker(output, value)


def _check_contains(output: str, value: str) -> dict:
    passed = value.lower() in output.lower()
    return {
        "passed": passed,
        "type": "contains",
        "expected": f'contains "{value}"',
        "got": output[:200] + "..." if len(output) > 200 else output,
        "message": "" if passed else f'Output does not contain "{value}"',
    }


def _check_not_contains(output: str, value: str) -> dict:
    passed = value.lower() not in output.lower()
    return {
        "passed": passed,
        "type": "not_contains",
        "expected": f'does not contain "{value}"',
        "got": output[:200] + "..." if len(output) > 200 else output,
        "message": "" if passed else f'Output contains "{value}" (should not)',
    }


def _check_starts_with(output: str, value: str) -> dict:
    passed = output.strip().lower().startswith(value.lower())
    return {
        "passed": passed,
        "type": "starts_with",
        "expected": f'starts with "{value}"',
        "got": output[:100],
        "message": "" if passed else f'Output does not start with "{value}"',
    }


def _check_regex(output: str, value: str) -> dict:
    try:
        passed = bool(re.search(value, output))
    except re.error as e:
        return {
            "passed": False,
            "type": "regex",
            "expected": f"matches /{value}/",
            "got": None,
            "message": f"Invalid regex: {e}",
        }
    return {
        "passed": passed,
        "type": "regex",
        "expected": f"matches /{value}/",
        "got": output[:200] + "..." if len(output) > 200 else output,
        "message": "" if passed else f"Output does not match regex /{value}/",
    }


def _check_equals(output: str, value: str) -> dict:
    passed = output.strip() == value.strip()
    return {
        "passed": passed,
        "type": "equals",
        "expected": value,
        "got": output.strip(),
        "message": "" if passed else "Output does not exactly match expected value",
    }


def _check_max_tokens(output: str, value: int) -> dict:
    # Rough token estimate: one token per whitespace-separated word
    token_estimate = len(output.split())
    passed = token_estimate <= value
    return {
        "passed": passed,
        "type": "max_tokens",
        "expected": f"<= {value} tokens",
        "got": f"~{token_estimate} tokens ({len(output)} chars)",
        "message": "" if passed else f"Output has ~{token_estimate} tokens, max is {value}",
    }


def _check_min_length(output: str, value: int) -> dict:
    length = len(output.strip())
    passed = length >= value
    return {
        "passed": passed,
        "type": "min_length",
        "expected": f">= {value} chars",
        "got": f"{length} chars",
        "message": "" if passed else f"Output is {length} chars, minimum is {value}",
    }


def _check_max_length(output: str, value: int) -> dict:
    length = len(output.strip())
    passed = length <= value
    return {
        "passed": passed,
        "type": "max_length",
        "expected": f"<= {value} chars",
        "got": f"{length} chars",
        "message": "" if passed else f"Output is {length} chars, maximum is {value}",
    }


def _check_llm_judge(output: str, criteria: str, model: str) -> dict:
    """Use an LLM to judge the output quality."""
    judge_prompt = f"""You are evaluating an LLM output. Answer only YES or NO.

Criteria: {criteria}

Output to evaluate:
---
{output}
---

Does the output meet the criteria? Answer only YES or NO."""

    try:
        response = call_llm(
            prompt=judge_prompt,
            model=model,
            temperature=0,
            max_tokens=10,
        )
        answer = response["text"].strip().upper()
        passed = answer.startswith("YES")
    except Exception as e:
        return {
            "passed": False,
            "type": "llm_judge",
            "expected": criteria,
            "got": f"Judge error: {e}",
            "message": f"LLM judge failed: {e}",
        }

    return {
        "passed": passed,
        "type": "llm_judge",
        "expected": criteria,
        "got": f"Judge answered: {answer}",
        "message": "" if passed else f"LLM judge said NO to: {criteria}",
    }
```
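
For orientation, a minimal sketch of how `check_assertion` is driven: the assertion dict mirrors a YAML `assert` entry, and the `model` argument is only consulted for the `llm_judge` type, so deterministic checks run with no API call.

```python
from promptlab.assertions import check_assertion

# Deterministic check: "contains" never calls an LLM, so no API key is needed.
result = check_assertion(
    {"type": "contains", "value": "Federal Reserve"},
    output="The Federal Reserve held rates steady.",
    model="claude-sonnet-4-20250514",  # ignored for non-judge assertion types
)
assert result["passed"] is True
print(result["expected"])  # contains "Federal Reserve"
```
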
promptlab/cli.py
ADDED

```python
"""CLI entry point for promptlab."""

import json
import sys
import time
from pathlib import Path

import click

from promptlab.loader import load_test_files
from promptlab.runner import run_all_tests
from promptlab.reporter import print_results, print_summary


@click.group()
@click.version_option(version="0.1.0")
def main():
    """promptlab — Automated testing for LLM prompts."""
    pass


@main.command()
@click.argument("path", type=click.Path(exists=True))
@click.option("--verbose", "-v", is_flag=True, help="Show full LLM responses")
@click.option("--json-output", "--json", "json_out", is_flag=True, help="Output results as JSON")
@click.option("--dry-run", is_flag=True, help="Show tests without calling APIs")
def run(path: str, verbose: bool, json_out: bool, dry_run: bool):
    """Run prompt tests from a file or directory."""
    target = Path(path)

    # Collect test files
    if target.is_file():
        files = [target]
    elif target.is_dir():
        files = sorted(target.glob("**/*.yaml")) + sorted(target.glob("**/*.yml"))
    else:
        click.echo(f"Error: {path} is not a file or directory", err=True)
        sys.exit(1)

    if not files:
        click.echo(f"No .yaml or .yml files found in {path}", err=True)
        sys.exit(1)

    # Load test suites
    suites = load_test_files(files)

    if dry_run:
        for suite in suites:
            click.echo(f"\n📄 {suite['file']}")
            click.echo(f"   Model: {suite['model']}")
            click.echo(f"   Tests: {len(suite['tests'])}")
            for test in suite["tests"]:
                assertions = ", ".join(a["type"] for a in test["assert"])
                click.echo(f"   - {test['name']} [{assertions}]")
        return

    # Run tests
    start = time.time()
    results = run_all_tests(suites, verbose=verbose)
    elapsed = time.time() - start

    if json_out:
        output = {
            "results": results,
            "elapsed_seconds": round(elapsed, 2),
            "total": len(results),
            "passed": sum(1 for r in results if r["passed"]),
            "failed": sum(1 for r in results if not r["passed"]),
        }
        click.echo(json.dumps(output, indent=2))
    else:
        print_results(results, verbose=verbose)
        print_summary(results, elapsed)

    # Exit with non-zero if any test failed
    if any(not r["passed"] for r in results):
        sys.exit(1)


if __name__ == "__main__":
    main()
```
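
Because `main` is a standard Click group, the command can also be exercised in-process with Click's own test runner instead of the console script. A small sketch, assuming a `tests/` directory of YAML files exists:

```python
from click.testing import CliRunner

from promptlab.cli import main

runner = CliRunner()
# --dry-run lists suites and assertion types without calling any API.
result = runner.invoke(main, ["run", "tests/", "--dry-run"])
print(result.output)
print(result.exit_code)  # 0 unless loading or validation failed
```
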
promptlab/loader.py
ADDED

```python
"""Load and validate YAML test files."""

import re
from pathlib import Path
from typing import Any

import yaml


def load_test_files(files: list[Path]) -> list[dict]:
    """Load and validate test suites from YAML files."""
    suites = []
    for file in files:
        suite = _load_single_file(file)
        suites.append(suite)
    return suites


def _load_single_file(file: Path) -> dict:
    """Load a single YAML test file."""
    with open(file) as f:
        raw = yaml.safe_load(f)

    if not isinstance(raw, dict):
        raise ValueError(f"{file}: Expected a YAML mapping at the top level")

    # Required fields
    if "prompt" not in raw:
        raise ValueError(f"{file}: Missing required 'prompt' field")
    if "tests" not in raw:
        raise ValueError(f"{file}: Missing required 'tests' field")

    prompt_template = raw["prompt"]
    model = raw.get("model", "claude-sonnet-4-20250514")
    system = raw.get("system", None)
    temperature = raw.get("temperature", 0)
    max_tokens = raw.get("max_tokens", 1024)

    tests = []
    for i, test_raw in enumerate(raw["tests"]):
        test = _validate_test(test_raw, file, i)
        tests.append(test)

    return {
        "file": str(file),
        "prompt_template": prompt_template,
        "model": model,
        "system": system,
        "temperature": temperature,
        "max_tokens": max_tokens,
        "tests": tests,
    }


def _validate_test(test: dict, file: Path, index: int) -> dict:
    """Validate a single test case."""
    if not isinstance(test, dict):
        raise ValueError(f"{file}: Test {index} must be a mapping")

    name = test.get("name", f"test_{index}")
    variables = test.get("vars", {})
    assertions = test.get("assert", [])

    if not assertions:
        raise ValueError(f"{file}: Test '{name}' has no assertions")

    validated_assertions = []
    for a in assertions:
        if "type" not in a:
            raise ValueError(f"{file}: Test '{name}' has assertion without 'type'")
        validated_assertions.append({
            "type": a["type"],
            "value": a.get("value"),
        })

    return {
        "name": name,
        "vars": variables,
        "assert": validated_assertions,
    }


def render_prompt(template: str, variables: dict) -> str:
    """Render a prompt template with variables using {{ var }} syntax."""
    result = template
    for key, value in variables.items():
        result = result.replace("{{ " + key + " }}", str(value))
        result = result.replace("{{" + key + "}}", str(value))
    return result
```
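
Note that `render_prompt` is plain string substitution, not a Jinja engine: only the `{{ var }}` and `{{var}}` spellings are replaced, and placeholders with no matching variable pass through untouched. A quick check:

```python
from promptlab.loader import render_prompt

template = "Summarize this article in 2-3 sentences:\n{{ article }}"
print(render_prompt(template, {"article": "The Fed held rates steady."}))
# Summarize this article in 2-3 sentences:
# The Fed held rates steady.

# Unmatched placeholders are left as-is rather than raising an error.
print(render_prompt("Hello {{ name }}", {}))  # Hello {{ name }}
```
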
promptlab/providers.py
ADDED

```python
"""LLM API providers: Claude and OpenAI."""

import os

import httpx


ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages"
OPENAI_API_URL = "https://api.openai.com/v1/chat/completions"


def _is_anthropic_model(model: str) -> bool:
    return model.startswith("claude")


def _is_openai_model(model: str) -> bool:
    return model.startswith("gpt") or model.startswith("o1") or model.startswith("o3")


def call_llm(
    prompt: str,
    model: str,
    system: str | None = None,
    temperature: float = 0,
    max_tokens: int = 1024,
) -> dict:
    """Call an LLM and return the response.

    Returns:
        dict with keys: text, model, input_tokens, output_tokens
    """
    if _is_anthropic_model(model):
        return _call_anthropic(prompt, model, system, temperature, max_tokens)
    elif _is_openai_model(model):
        return _call_openai(prompt, model, system, temperature, max_tokens)
    else:
        raise ValueError(
            f"Unknown model: {model}. "
            "Supported prefixes: 'claude' (Anthropic), 'gpt'/'o1'/'o3' (OpenAI)"
        )


def _call_anthropic(
    prompt: str,
    model: str,
    system: str | None,
    temperature: float,
    max_tokens: int,
) -> dict:
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        raise ValueError(
            "ANTHROPIC_API_KEY environment variable is not set. "
            "Get a key at https://console.anthropic.com/"
        )

    headers = {
        "x-api-key": api_key,
        "content-type": "application/json",
        "anthropic-version": "2023-06-01",
    }

    body: dict = {
        "model": model,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "messages": [{"role": "user", "content": prompt}],
    }
    if system:
        body["system"] = system

    with httpx.Client(timeout=60) as client:
        resp = client.post(ANTHROPIC_API_URL, headers=headers, json=body)

    if resp.status_code != 200:
        raise RuntimeError(f"Anthropic API error ({resp.status_code}): {resp.text}")

    data = resp.json()
    text = ""
    for block in data.get("content", []):
        if block.get("type") == "text":
            text += block.get("text", "")

    return {
        "text": text,
        "model": data.get("model", model),
        "input_tokens": data.get("usage", {}).get("input_tokens", 0),
        "output_tokens": data.get("usage", {}).get("output_tokens", 0),
    }


def _call_openai(
    prompt: str,
    model: str,
    system: str | None,
    temperature: float,
    max_tokens: int,
) -> dict:
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise ValueError(
            "OPENAI_API_KEY environment variable is not set. "
            "Get a key at https://platform.openai.com/"
        )

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})

    body = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    with httpx.Client(timeout=60) as client:
        resp = client.post(OPENAI_API_URL, headers=headers, json=body)

    if resp.status_code != 200:
        raise RuntimeError(f"OpenAI API error ({resp.status_code}): {resp.text}")

    data = resp.json()
    text = data["choices"][0]["message"]["content"]

    return {
        "text": text,
        "model": data.get("model", model),
        "input_tokens": data.get("usage", {}).get("prompt_tokens", 0),
        "output_tokens": data.get("usage", {}).get("completion_tokens", 0),
    }
```
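
`call_llm` routes on the model-name prefix and returns the same four-key dict for either provider. A minimal sketch of a direct call; note this performs a real (billed) API request and assumes `ANTHROPIC_API_KEY` is exported:

```python
from promptlab.providers import call_llm

# Routes to the Anthropic Messages API because the model name starts with "claude".
resp = call_llm(
    prompt="Reply with the single word: pong",
    model="claude-sonnet-4-20250514",
    temperature=0,
    max_tokens=16,
)
print(resp["text"])
print(resp["input_tokens"], resp["output_tokens"])
```
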
promptlab/reporter.py
ADDED

```python
"""Reporter: pretty terminal output for test results."""

from rich.console import Console
from rich.table import Table

console = Console()


def print_results(results: list[dict], verbose: bool = False):
    """Print individual test results."""
    console.print()

    for result in results:
        status = "[bold green]✅ PASS[/]" if result["passed"] else "[bold red]❌ FAIL[/]"
        name = f"{result['suite']} :: {result['name']}"
        elapsed = f"({result['elapsed']}s)"

        console.print(f"  {status} {name} [dim]{elapsed}[/]")

        if result["error"]:
            console.print(f"       [red]Error: {result['error']}[/]")

        if not result["passed"]:
            for assertion in result["assertions"]:
                if not assertion["passed"]:
                    console.print(f"       [dim]Expected:[/] {assertion['expected']}")
                    if assertion.get("message"):
                        console.print(f"       [dim]Reason:[/] {assertion['message']}")

        if verbose and result["output"]:
            console.print("       [dim]Output:[/]")
            # Indent and truncate output
            output_lines = result["output"][:500].split("\n")
            for line in output_lines:
                console.print(f"       [dim]{line}[/]")
            if len(result["output"]) > 500:
                console.print("       [dim]... (truncated)[/]")
            console.print()


def print_summary(results: list[dict], elapsed: float):
    """Print summary line."""
    total = len(results)
    passed = sum(1 for r in results if r["passed"])
    failed = total - passed

    total_input = sum(r.get("input_tokens", 0) for r in results)
    total_output = sum(r.get("output_tokens", 0) for r in results)

    console.print()
    console.print("━" * 50)

    if failed == 0:
        console.print(
            f"  [bold green]Results: {passed} passed[/], "
            f"{total} total ({elapsed:.1f}s)"
        )
    else:
        console.print(
            f"  [bold red]Results: {passed} passed, {failed} failed[/], "
            f"{total} total ({elapsed:.1f}s)"
        )

    if total_input > 0 or total_output > 0:
        console.print(
            f"  [dim]Tokens: {total_input:,} input, {total_output:,} output[/]"
        )

    console.print()
```
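
The reporter is decoupled from the runner: it only reads plain dicts, so it can be previewed with hand-built results. A small sketch covering the keys `print_results` and `print_summary` actually touch:

```python
from promptlab.reporter import print_results, print_summary

# Hand-built result dicts with the keys the reporter reads.
results = [
    {"suite": "summarize", "name": "short_summary", "passed": True,
     "elapsed": 1.2, "output": "The Fed held rates steady.", "error": None,
     "assertions": [], "input_tokens": 120, "output_tokens": 40},
    {"suite": "summarize", "name": "mentions_fed", "passed": False,
     "elapsed": 0.9, "output": "Rates were unchanged.", "error": None,
     "assertions": [{"passed": False, "type": "contains",
                     "expected": 'contains "Federal Reserve"',
                     "got": "Rates were unchanged.",
                     "message": 'Output does not contain "Federal Reserve"'}],
     "input_tokens": 100, "output_tokens": 30},
]
print_results(results, verbose=False)
print_summary(results, elapsed=2.1)
```
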
promptlab/runner.py
ADDED

```python
"""Test runner: executes test suites and collects results."""

import time

from promptlab.assertions import check_assertion
from promptlab.loader import render_prompt
from promptlab.providers import call_llm


def run_all_tests(suites: list[dict], verbose: bool = False) -> list[dict]:
    """Run all test suites and return results."""
    results = []

    for suite in suites:
        suite_results = _run_suite(suite, verbose=verbose)
        results.extend(suite_results)

    return results


def _run_suite(suite: dict, verbose: bool = False) -> list[dict]:
    """Run a single test suite."""
    results = []
    file_name = suite["file"]
    # Use the filename without extension as the suite name
    suite_name = file_name.rsplit("/", 1)[-1].rsplit(".", 1)[0]

    for test in suite["tests"]:
        result = _run_single_test(suite, test, suite_name, verbose=verbose)
        results.append(result)

    return results


def _run_single_test(suite: dict, test: dict, suite_name: str, verbose: bool = False) -> dict:
    """Run a single test case."""
    test_name = test["name"]
    start = time.time()

    # Render the prompt with variables
    prompt = render_prompt(suite["prompt_template"], test["vars"])

    # Call the LLM
    try:
        response = call_llm(
            prompt=prompt,
            model=suite["model"],
            system=suite.get("system"),
            temperature=suite.get("temperature", 0),
            max_tokens=suite.get("max_tokens", 1024),
        )
        output = response["text"]
        error = None
    except Exception as e:
        output = ""
        error = str(e)
        response = {"input_tokens": 0, "output_tokens": 0, "model": suite["model"]}

    elapsed = time.time() - start

    # Check assertions
    assertion_results = []
    all_passed = error is None

    if error is None:
        for assertion in test["assert"]:
            result = check_assertion(assertion, output, suite["model"])
            assertion_results.append(result)
            if not result["passed"]:
                all_passed = False
    else:
        all_passed = False

    return {
        "suite": suite_name,
        "name": test_name,
        "passed": all_passed,
        "elapsed": round(elapsed, 2),
        "output": output,
        "error": error,
        "assertions": assertion_results,
        "model": response.get("model", suite["model"]),
        "input_tokens": response.get("input_tokens", 0),
        "output_tokens": response.get("output_tokens", 0),
    }
```
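
The runner consumes suite dicts in the shape `loader.py` returns, so a suite can also be built by hand and run without a YAML file. A sketch; like any non-dry run it makes real API calls and needs the matching key exported:

```python
from promptlab.runner import run_all_tests

suite = {
    "file": "inline/summarize.yaml",  # only used to derive the suite name
    "prompt_template": "Summarize in one sentence:\n{{ article }}",
    "model": "claude-sonnet-4-20250514",
    "system": None,
    "temperature": 0,
    "max_tokens": 200,
    "tests": [
        {"name": "short_summary",
         "vars": {"article": "The Fed held rates steady."},
         "assert": [{"type": "max_length", "value": 300}]},
    ],
}
results = run_all_tests([suite])
print(results[0]["passed"], results[0]["elapsed"], results[0]["output_tokens"])
```
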

promptlab_cli-0.1.0.dist-info/METADATA
ADDED

```
Metadata-Version: 2.4
Name: promptlab-cli
Version: 0.1.0
Summary: Automated testing for LLM prompts. Like pytest, but for prompts.
Project-URL: Homepage, https://github.com/vigp17/promptlab
Project-URL: Repository, https://github.com/vigp17/promptlab
Project-URL: Issues, https://github.com/vigp17/promptlab/issues
Author: Vignesh Pai
License-Expression: MIT
Keywords: ai,claude,evaluation,llm,openai,prompts,testing
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Testing
Requires-Python: >=3.10
Requires-Dist: click>=8.0
Requires-Dist: httpx>=0.24.0
Requires-Dist: pyyaml>=6.0
Requires-Dist: rich>=13.0
Provides-Extra: dev
Requires-Dist: pytest>=7.0; extra == 'dev'
Requires-Dist: ruff>=0.1.0; extra == 'dev'
Description-Content-Type: text/markdown
```

# promptlab ⚡

Automated testing for LLM prompts. Write test cases in YAML, run them against Claude or OpenAI, get pass/fail results in your terminal.

**Like pytest, but for prompts.**

```bash
pip install promptlab-cli
promptlab run tests/
```

```
✅ summarize_article :: returns_short_summary PASS (1.2s)
✅ summarize_article :: mentions_key_points PASS (1.1s)
❌ translate_text :: preserves_tone FAIL (0.9s)
     Expected: contains "formal"
     Got: "Here is the translated text in a casual style..."

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Results: 2 passed, 1 failed, 3 total (3.2s)
```

## Why?

You're building an app with Claude or GPT. Your prompt works today. Tomorrow you tweak it and something breaks. You don't notice until a user complains.

**promptlab catches prompt regressions before they ship.** Define what good output looks like, run tests on every change, and know immediately if something broke.

## Quickstart

### Install

```bash
pip install promptlab-cli
```

### Set your API key

```bash
export ANTHROPIC_API_KEY=sk-ant-...
# or
export OPENAI_API_KEY=sk-...
```

### Write a test file

Create `tests/summarize.yaml`:

```yaml
prompt: |
  Summarize this article in 2-3 sentences:
  {{ article }}

model: claude-sonnet-4-20250514

tests:
  - name: short_summary
    vars:
      article: |
        The Federal Reserve held interest rates steady on Wednesday,
        keeping the benchmark rate in the 5.25%-5.50% range. Chair
        Jerome Powell said the committee needs more confidence that
        inflation is moving toward the 2% target before cutting rates.
    assert:
      - type: max_tokens
        value: 100
      - type: contains
        value: "Federal Reserve"
      - type: contains
        value: "interest rate"

  - name: handles_empty_input
    vars:
      article: ""
    assert:
      - type: not_contains
        value: "error"
      - type: min_length
        value: 10
```

### Run it

```bash
promptlab run tests/
```

## Test File Format

Each `.yaml` file defines a prompt and its test cases:

```yaml
# The prompt template. Use {{ variable }} for inputs.
prompt: |
  You are a helpful assistant. {{ instruction }}

# Which model to use
model: claude-sonnet-4-20250514  # or gpt-4o, claude-haiku-4-5-20251001, etc.

# Optional system prompt
system: "You are a concise technical writer."

# Optional model parameters
temperature: 0
max_tokens: 500

# Test cases
tests:
  - name: test_name
    vars:
      instruction: "Explain what a CPU does in one sentence."
    assert:
      - type: contains
        value: "processor"
      - type: max_tokens
        value: 50
```

## Assertion Types

| Type | Description | Example |
|---|---|---|
| `contains` | Output must contain this string (case-insensitive) | `value: "machine learning"` |
| `not_contains` | Output must NOT contain this string | `value: "I'm sorry"` |
| `starts_with` | Output must start with this string | `value: "Sure"` |
| `regex` | Output must match this regex pattern | `value: "\\d{4}"` |
| `max_tokens` | Output must be at most N tokens (rough estimate from word count) | `value: 100` |
| `min_length` | Output must be at least N characters | `value: 50` |
| `max_length` | Output must be at most N characters | `value: 500` |
| `equals` | Output must exactly equal this string | `value: "42"` |
| `llm_judge` | Ask another LLM to evaluate the output | `value: "Is this response helpful and accurate?"` |

## LLM-as-Judge

The most powerful assertion type. Uses a second LLM call to evaluate output quality:

```yaml
tests:
  - name: helpful_response
    vars:
      question: "How do I fix a memory leak in Python?"
    assert:
      - type: llm_judge
        value: "Does this response provide specific, actionable debugging steps? Answer YES or NO."
```

## Supported Models

**Anthropic (Claude):**
- `claude-sonnet-4-20250514`
- `claude-haiku-4-5-20251001`
- Set `ANTHROPIC_API_KEY` environment variable

**OpenAI:**
- `gpt-4o`
- `gpt-4o-mini`
- Set `OPENAI_API_KEY` environment variable

## CLI Commands

```bash
# Run all test files in a directory
promptlab run tests/

# Run a single test file
promptlab run tests/summarize.yaml

# Verbose output (show full LLM responses)
promptlab run tests/ --verbose

# Output results as JSON
promptlab run tests/ --json

# Dry run (show what would be tested without calling APIs)
promptlab run tests/ --dry-run
```

## Use Cases

- **Prompt regression testing** — Run tests in CI/CD to catch regressions
- **Prompt comparison** — Test the same cases across different models
- **Guard rails validation** — Verify your prompt rejects harmful inputs
- **Output format checking** — Ensure structured output matches expectations

## Development

```bash
git clone https://github.com/vigp17/promptlab.git
cd promptlab
pip install -e ".[dev]"
pytest
```

## Roadmap

- [x] YAML test definitions
- [x] Claude and OpenAI support
- [x] 9 assertion types including LLM-as-judge
- [x] CLI with colored output
- [ ] Cost tracking per test run
- [ ] HTML report generation
- [ ] Parallel test execution
- [ ] GitHub Actions integration
- [ ] Prompt versioning and diff
- [ ] Custom scoring functions

## License

MIT

promptlab_cli-0.1.0.dist-info/RECORD
ADDED

```
promptlab/__init__.py,sha256=os5p2uKFDJzFm1RuGX_yAwXOc6Jktvbtd79J9voVwPE,75
promptlab/assertions.py,sha256=AD9CWAhktSJkh_4xtdKUipm3_PrTbmldxJ_iABsYWZw,5348
promptlab/cli.py,sha256=_c00XcyyzmKs-5t-c6XwB5MJKn13gCe6kjgnuR2ricA,2452
promptlab/loader.py,sha256=sg0aU0HA5OSPpl-cGWvUKCgPBwM5kfsK1xJltDCRmWQ,2579
promptlab/providers.py,sha256=C6aRNyieptpGNa4wylUtsJgGxBnwE5Fngnk_-WwZOfQ,3743
promptlab/reporter.py,sha256=QXjhWJpX5CtzH9tKAcAoI716bMfV998OQQP5lvm8e0M,2329
promptlab/runner.py,sha256=wogLZ4K2hyURl6vCG-SS3czfIMElWZmgePpYKeXtUMs,2520
promptlab_cli-0.1.0.dist-info/METADATA,sha256=9FnTyN5w9e_-AESCS2-ogLzn_WwH72dn797lIYGlRoU,6303
promptlab_cli-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
promptlab_cli-0.1.0.dist-info/entry_points.txt,sha256=-_chAvLXxNGzv9TkKKVTmZLMe9rBMgu7qPXxuU4Io1Y,49
promptlab_cli-0.1.0.dist-info/RECORD,,
```