promptlab-cli 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promptlab_cli-0.1.0/PKG-INFO +235 -0
- promptlab_cli-0.1.0/README.md +209 -0
- promptlab_cli-0.1.0/examples/summarize.yaml +34 -0
- promptlab_cli-0.1.0/pyproject.toml +50 -0
- promptlab_cli-0.1.0/src/promptlab/__init__.py +3 -0
- promptlab_cli-0.1.0/src/promptlab/assertions.py +182 -0
- promptlab_cli-0.1.0/src/promptlab/cli.py +81 -0
- promptlab_cli-0.1.0/src/promptlab/loader.py +89 -0
- promptlab_cli-0.1.0/src/promptlab/providers.py +137 -0
- promptlab_cli-0.1.0/src/promptlab/reporter.py +69 -0
- promptlab_cli-0.1.0/src/promptlab/runner.py +85 -0
- promptlab_cli-0.1.0/tests/test_assertions.py +110 -0
- promptlab_cli-0.1.0/tests/test_loader.py +128 -0
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: promptlab-cli
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Automated testing for LLM prompts. Like pytest, but for prompts.
|
|
5
|
+
Project-URL: Homepage, https://github.com/vigp17/promptlab
|
|
6
|
+
Project-URL: Repository, https://github.com/vigp17/promptlab
|
|
7
|
+
Project-URL: Issues, https://github.com/vigp17/promptlab/issues
|
|
8
|
+
Author: Vignesh Pai
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
Keywords: ai,claude,evaluation,llm,openai,prompts,testing
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Classifier: Topic :: Software Development :: Testing
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Requires-Dist: click>=8.0
|
|
19
|
+
Requires-Dist: httpx>=0.24.0
|
|
20
|
+
Requires-Dist: pyyaml>=6.0
|
|
21
|
+
Requires-Dist: rich>=13.0
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
24
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
|
|
27
|
+
# promptlab ⚡
|
|
28
|
+
|
|
29
|
+
Automated testing for LLM prompts. Write test cases in YAML, run them against Claude or OpenAI, get pass/fail results in your terminal.
|
|
30
|
+
|
|
31
|
+
**Like pytest, but for prompts.**
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install promptlab-cli
|
|
35
|
+
promptlab run tests/
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
✅ summarize_article :: returns_short_summary PASS (1.2s)
|
|
40
|
+
✅ summarize_article :: mentions_key_points PASS (1.1s)
|
|
41
|
+
❌ translate_text :: preserves_tone FAIL (0.9s)
|
|
42
|
+
Expected: contains "formal"
|
|
43
|
+
Got: "Here is the translated text in a casual style..."
|
|
44
|
+
|
|
45
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
46
|
+
Results: 2 passed, 1 failed, 3 total (3.2s)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Why?
|
|
50
|
+
|
|
51
|
+
You're building an app with Claude or GPT. Your prompt works today. Tomorrow you tweak it and something breaks. You don't notice until a user complains.
|
|
52
|
+
|
|
53
|
+
**promptlab catches prompt regressions before they ship.** Define what good output looks like, run tests on every change, and know immediately if something broke.
|
|
54
|
+
|
|
55
|
+
## Quickstart
|
|
56
|
+
|
|
57
|
+
### Install
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install promptlab-cli
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Set your API key
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
export ANTHROPIC_API_KEY=sk-ant-...
|
|
67
|
+
# or
|
|
68
|
+
export OPENAI_API_KEY=sk-...
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Write a test file
|
|
72
|
+
|
|
73
|
+
Create `tests/summarize.yaml`:
|
|
74
|
+
|
|
75
|
+
```yaml
|
|
76
|
+
prompt: |
|
|
77
|
+
Summarize this article in 2-3 sentences:
|
|
78
|
+
{{ article }}
|
|
79
|
+
|
|
80
|
+
model: claude-sonnet-4-20250514
|
|
81
|
+
|
|
82
|
+
tests:
|
|
83
|
+
- name: short_summary
|
|
84
|
+
vars:
|
|
85
|
+
article: |
|
|
86
|
+
The Federal Reserve held interest rates steady on Wednesday,
|
|
87
|
+
keeping the benchmark rate in the 5.25%-5.50% range. Chair
|
|
88
|
+
Jerome Powell said the committee needs more confidence that
|
|
89
|
+
inflation is moving toward the 2% target before cutting rates.
|
|
90
|
+
assert:
|
|
91
|
+
- type: max_tokens
|
|
92
|
+
value: 100
|
|
93
|
+
- type: contains
|
|
94
|
+
value: "Federal Reserve"
|
|
95
|
+
- type: contains
|
|
96
|
+
value: "interest rate"
|
|
97
|
+
|
|
98
|
+
- name: handles_empty_input
|
|
99
|
+
vars:
|
|
100
|
+
article: ""
|
|
101
|
+
assert:
|
|
102
|
+
- type: not_contains
|
|
103
|
+
value: "error"
|
|
104
|
+
- type: min_length
|
|
105
|
+
value: 10
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### Run it
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
promptlab run tests/
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Test File Format
|
|
115
|
+
|
|
116
|
+
Each `.yaml` file defines a prompt and its test cases:
|
|
117
|
+
|
|
118
|
+
```yaml
|
|
119
|
+
# The prompt template. Use {{ variable }} for inputs.
|
|
120
|
+
prompt: |
|
|
121
|
+
You are a helpful assistant. {{ instruction }}
|
|
122
|
+
|
|
123
|
+
# Which model to use
|
|
124
|
+
model: claude-sonnet-4-20250514 # or gpt-4o, claude-haiku-4-5-20251001, etc.
|
|
125
|
+
|
|
126
|
+
# Optional system prompt
|
|
127
|
+
system: "You are a concise technical writer."
|
|
128
|
+
|
|
129
|
+
# Optional model parameters
|
|
130
|
+
temperature: 0
|
|
131
|
+
max_tokens: 500
|
|
132
|
+
|
|
133
|
+
# Test cases
|
|
134
|
+
tests:
|
|
135
|
+
- name: test_name
|
|
136
|
+
vars:
|
|
137
|
+
instruction: "Explain what a CPU does in one sentence."
|
|
138
|
+
assert:
|
|
139
|
+
- type: contains
|
|
140
|
+
value: "processor"
|
|
141
|
+
- type: max_tokens
|
|
142
|
+
value: 50
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## Assertion Types
|
|
146
|
+
|
|
147
|
+
| Type | Description | Example |
|
|
148
|
+
|---|---|---|
|
|
149
|
+
| `contains` | Output must contain this string (case-insensitive) | `value: "machine learning"` |
|
|
150
|
+
| `not_contains` | Output must NOT contain this string | `value: "I'm sorry"` |
|
|
151
|
+
| `starts_with` | Output must start with this string | `value: "Sure"` |
|
|
152
|
+
| `regex` | Output must match this regex pattern | `value: "\\d{4}"` |
|
|
153
|
+
| `max_tokens` | Output must be at most N tokens | `value: 100` |
|
|
154
|
+
| `min_length` | Output must be at least N characters | `value: 50` |
|
|
155
|
+
| `max_length` | Output must be at most N characters | `value: 500` |
|
|
156
|
+
| `equals` | Output must exactly equal this string | `value: "42"` |
|
|
157
|
+
| `llm_judge` | Ask another LLM to evaluate the output | `value: "Is this response helpful and accurate?"` |
|
|
158
|
+
|
|
159
|
+
## LLM-as-Judge
|
|
160
|
+
|
|
161
|
+
The most powerful assertion type. Uses a second LLM call to evaluate output quality:
|
|
162
|
+
|
|
163
|
+
```yaml
|
|
164
|
+
tests:
|
|
165
|
+
- name: helpful_response
|
|
166
|
+
vars:
|
|
167
|
+
question: "How do I fix a memory leak in Python?"
|
|
168
|
+
assert:
|
|
169
|
+
- type: llm_judge
|
|
170
|
+
value: "Does this response provide specific, actionable debugging steps? Answer YES or NO."
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## Supported Models
|
|
174
|
+
|
|
175
|
+
**Anthropic (Claude):**
|
|
176
|
+
- `claude-sonnet-4-20250514`
|
|
177
|
+
- `claude-haiku-4-5-20251001`
|
|
178
|
+
- Set `ANTHROPIC_API_KEY` environment variable
|
|
179
|
+
|
|
180
|
+
**OpenAI:**
|
|
181
|
+
- `gpt-4o`
|
|
182
|
+
- `gpt-4o-mini`
|
|
183
|
+
- Set `OPENAI_API_KEY` environment variable
|
|
184
|
+
|
|
185
|
+
## CLI Commands
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
# Run all test files in a directory
|
|
189
|
+
promptlab run tests/
|
|
190
|
+
|
|
191
|
+
# Run a single test file
|
|
192
|
+
promptlab run tests/summarize.yaml
|
|
193
|
+
|
|
194
|
+
# Verbose output (show full LLM responses)
|
|
195
|
+
promptlab run tests/ --verbose
|
|
196
|
+
|
|
197
|
+
# Output results as JSON
|
|
198
|
+
promptlab run tests/ --json
|
|
199
|
+
|
|
200
|
+
# Dry run (show what would be tested without calling APIs)
|
|
201
|
+
promptlab run tests/ --dry-run
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
## Use Cases
|
|
205
|
+
|
|
206
|
+
- **Prompt regression testing** — Run tests in CI/CD to catch regressions
|
|
207
|
+
- **Prompt comparison** — Test the same cases across different models
|
|
208
|
+
- **Guard rails validation** — Verify your prompt rejects harmful inputs
|
|
209
|
+
- **Output format checking** — Ensure structured output matches expectations
|
|
210
|
+
|
|
211
|
+
## Development
|
|
212
|
+
|
|
213
|
+
```bash
|
|
214
|
+
git clone https://github.com/vigp17/promptlab.git
|
|
215
|
+
cd promptlab
|
|
216
|
+
pip install -e ".[dev]"
|
|
217
|
+
pytest
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
## Roadmap
|
|
221
|
+
|
|
222
|
+
- [x] YAML test definitions
|
|
223
|
+
- [x] Claude and OpenAI support
|
|
224
|
+
- [x] 9 assertion types including LLM-as-judge
|
|
225
|
+
- [x] CLI with colored output
|
|
226
|
+
- [ ] Cost tracking per test run
|
|
227
|
+
- [ ] HTML report generation
|
|
228
|
+
- [ ] Parallel test execution
|
|
229
|
+
- [ ] GitHub Actions integration
|
|
230
|
+
- [ ] Prompt versioning and diff
|
|
231
|
+
- [ ] Custom scoring functions
|
|
232
|
+
|
|
233
|
+
## License
|
|
234
|
+
|
|
235
|
+
MIT
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
# promptlab ⚡
|
|
2
|
+
|
|
3
|
+
Automated testing for LLM prompts. Write test cases in YAML, run them against Claude or OpenAI, get pass/fail results in your terminal.
|
|
4
|
+
|
|
5
|
+
**Like pytest, but for prompts.**
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install promptlab-cli
|
|
9
|
+
promptlab run tests/
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
```
|
|
13
|
+
✅ summarize_article :: returns_short_summary PASS (1.2s)
|
|
14
|
+
✅ summarize_article :: mentions_key_points PASS (1.1s)
|
|
15
|
+
❌ translate_text :: preserves_tone FAIL (0.9s)
|
|
16
|
+
Expected: contains "formal"
|
|
17
|
+
Got: "Here is the translated text in a casual style..."
|
|
18
|
+
|
|
19
|
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
|
20
|
+
Results: 2 passed, 1 failed, 3 total (3.2s)
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Why?
|
|
24
|
+
|
|
25
|
+
You're building an app with Claude or GPT. Your prompt works today. Tomorrow you tweak it and something breaks. You don't notice until a user complains.
|
|
26
|
+
|
|
27
|
+
**promptlab catches prompt regressions before they ship.** Define what good output looks like, run tests on every change, and know immediately if something broke.
|
|
28
|
+
|
|
29
|
+
## Quickstart
|
|
30
|
+
|
|
31
|
+
### Install
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install promptlab-cli
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### Set your API key
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
export ANTHROPIC_API_KEY=sk-ant-...
|
|
41
|
+
# or
|
|
42
|
+
export OPENAI_API_KEY=sk-...
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Write a test file
|
|
46
|
+
|
|
47
|
+
Create `tests/summarize.yaml`:
|
|
48
|
+
|
|
49
|
+
```yaml
|
|
50
|
+
prompt: |
|
|
51
|
+
Summarize this article in 2-3 sentences:
|
|
52
|
+
{{ article }}
|
|
53
|
+
|
|
54
|
+
model: claude-sonnet-4-20250514
|
|
55
|
+
|
|
56
|
+
tests:
|
|
57
|
+
- name: short_summary
|
|
58
|
+
vars:
|
|
59
|
+
article: |
|
|
60
|
+
The Federal Reserve held interest rates steady on Wednesday,
|
|
61
|
+
keeping the benchmark rate in the 5.25%-5.50% range. Chair
|
|
62
|
+
Jerome Powell said the committee needs more confidence that
|
|
63
|
+
inflation is moving toward the 2% target before cutting rates.
|
|
64
|
+
assert:
|
|
65
|
+
- type: max_tokens
|
|
66
|
+
value: 100
|
|
67
|
+
- type: contains
|
|
68
|
+
value: "Federal Reserve"
|
|
69
|
+
- type: contains
|
|
70
|
+
value: "interest rate"
|
|
71
|
+
|
|
72
|
+
- name: handles_empty_input
|
|
73
|
+
vars:
|
|
74
|
+
article: ""
|
|
75
|
+
assert:
|
|
76
|
+
- type: not_contains
|
|
77
|
+
value: "error"
|
|
78
|
+
- type: min_length
|
|
79
|
+
value: 10
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Run it
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
promptlab run tests/
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Test File Format
|
|
89
|
+
|
|
90
|
+
Each `.yaml` file defines a prompt and its test cases:
|
|
91
|
+
|
|
92
|
+
```yaml
|
|
93
|
+
# The prompt template. Use {{ variable }} for inputs.
|
|
94
|
+
prompt: |
|
|
95
|
+
You are a helpful assistant. {{ instruction }}
|
|
96
|
+
|
|
97
|
+
# Which model to use
|
|
98
|
+
model: claude-sonnet-4-20250514 # or gpt-4o, claude-haiku-4-5-20251001, etc.
|
|
99
|
+
|
|
100
|
+
# Optional system prompt
|
|
101
|
+
system: "You are a concise technical writer."
|
|
102
|
+
|
|
103
|
+
# Optional model parameters
|
|
104
|
+
temperature: 0
|
|
105
|
+
max_tokens: 500
|
|
106
|
+
|
|
107
|
+
# Test cases
|
|
108
|
+
tests:
|
|
109
|
+
- name: test_name
|
|
110
|
+
vars:
|
|
111
|
+
instruction: "Explain what a CPU does in one sentence."
|
|
112
|
+
assert:
|
|
113
|
+
- type: contains
|
|
114
|
+
value: "processor"
|
|
115
|
+
- type: max_tokens
|
|
116
|
+
value: 50
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## Assertion Types
|
|
120
|
+
|
|
121
|
+
| Type | Description | Example |
|
|
122
|
+
|---|---|---|
|
|
123
|
+
| `contains` | Output must contain this string (case-insensitive) | `value: "machine learning"` |
|
|
124
|
+
| `not_contains` | Output must NOT contain this string | `value: "I'm sorry"` |
|
|
125
|
+
| `starts_with` | Output must start with this string | `value: "Sure"` |
|
|
126
|
+
| `regex` | Output must match this regex pattern | `value: "\\d{4}"` |
|
|
127
|
+
| `max_tokens` | Output must be at most N tokens | `value: 100` |
|
|
128
|
+
| `min_length` | Output must be at least N characters | `value: 50` |
|
|
129
|
+
| `max_length` | Output must be at most N characters | `value: 500` |
|
|
130
|
+
| `equals` | Output must exactly equal this string | `value: "42"` |
|
|
131
|
+
| `llm_judge` | Ask another LLM to evaluate the output | `value: "Is this response helpful and accurate?"` |
|
|
132
|
+
|
|
133
|
+
## LLM-as-Judge
|
|
134
|
+
|
|
135
|
+
The most powerful assertion type. Uses a second LLM call to evaluate output quality:
|
|
136
|
+
|
|
137
|
+
```yaml
|
|
138
|
+
tests:
|
|
139
|
+
- name: helpful_response
|
|
140
|
+
vars:
|
|
141
|
+
question: "How do I fix a memory leak in Python?"
|
|
142
|
+
assert:
|
|
143
|
+
- type: llm_judge
|
|
144
|
+
value: "Does this response provide specific, actionable debugging steps? Answer YES or NO."
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## Supported Models
|
|
148
|
+
|
|
149
|
+
**Anthropic (Claude):**
|
|
150
|
+
- `claude-sonnet-4-20250514`
|
|
151
|
+
- `claude-haiku-4-5-20251001`
|
|
152
|
+
- Set `ANTHROPIC_API_KEY` environment variable
|
|
153
|
+
|
|
154
|
+
**OpenAI:**
|
|
155
|
+
- `gpt-4o`
|
|
156
|
+
- `gpt-4o-mini`
|
|
157
|
+
- Set `OPENAI_API_KEY` environment variable
|
|
158
|
+
|
|
159
|
+
## CLI Commands
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
# Run all test files in a directory
|
|
163
|
+
promptlab run tests/
|
|
164
|
+
|
|
165
|
+
# Run a single test file
|
|
166
|
+
promptlab run tests/summarize.yaml
|
|
167
|
+
|
|
168
|
+
# Verbose output (show full LLM responses)
|
|
169
|
+
promptlab run tests/ --verbose
|
|
170
|
+
|
|
171
|
+
# Output results as JSON
|
|
172
|
+
promptlab run tests/ --json
|
|
173
|
+
|
|
174
|
+
# Dry run (show what would be tested without calling APIs)
|
|
175
|
+
promptlab run tests/ --dry-run
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
## Use Cases
|
|
179
|
+
|
|
180
|
+
- **Prompt regression testing** — Run tests in CI/CD to catch regressions
|
|
181
|
+
- **Prompt comparison** — Test the same cases across different models
|
|
182
|
+
- **Guard rails validation** — Verify your prompt rejects harmful inputs
|
|
183
|
+
- **Output format checking** — Ensure structured output matches expectations
|
|
184
|
+
|
|
185
|
+
## Development
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
git clone https://github.com/vigp17/promptlab.git
|
|
189
|
+
cd promptlab
|
|
190
|
+
pip install -e ".[dev]"
|
|
191
|
+
pytest
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## Roadmap
|
|
195
|
+
|
|
196
|
+
- [x] YAML test definitions
|
|
197
|
+
- [x] Claude and OpenAI support
|
|
198
|
+
- [x] 9 assertion types including LLM-as-judge
|
|
199
|
+
- [x] CLI with colored output
|
|
200
|
+
- [ ] Cost tracking per test run
|
|
201
|
+
- [ ] HTML report generation
|
|
202
|
+
- [ ] Parallel test execution
|
|
203
|
+
- [ ] GitHub Actions integration
|
|
204
|
+
- [ ] Prompt versioning and diff
|
|
205
|
+
- [ ] Custom scoring functions
|
|
206
|
+
|
|
207
|
+
## License
|
|
208
|
+
|
|
209
|
+
MIT
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
prompt: |
|
|
2
|
+
Summarize this text in 2-3 sentences:
|
|
3
|
+
{{ text }}
|
|
4
|
+
|
|
5
|
+
model: claude-sonnet-4-20250514
|
|
6
|
+
temperature: 0
|
|
7
|
+
max_tokens: 200
|
|
8
|
+
|
|
9
|
+
tests:
|
|
10
|
+
- name: short_summary
|
|
11
|
+
vars:
|
|
12
|
+
text: |
|
|
13
|
+
The Federal Reserve held interest rates steady on Wednesday,
|
|
14
|
+
keeping the benchmark rate in the 5.25%-5.50% range. Chair
|
|
15
|
+
Jerome Powell said the committee needs more confidence that
|
|
16
|
+
inflation is moving toward the 2% target before cutting rates.
|
|
17
|
+
Markets had been expecting a cut, but Powell's comments suggest
|
|
18
|
+
the Fed will wait for more data before making any changes.
|
|
19
|
+
assert:
|
|
20
|
+
- type: contains
|
|
21
|
+
value: "Federal Reserve"
|
|
22
|
+
- type: contains
|
|
23
|
+
value: "rate"
|
|
24
|
+
- type: max_tokens
|
|
25
|
+
value: 80
|
|
26
|
+
|
|
27
|
+
- name: handles_short_input
|
|
28
|
+
vars:
|
|
29
|
+
text: "The sky is blue."
|
|
30
|
+
assert:
|
|
31
|
+
- type: min_length
|
|
32
|
+
value: 10
|
|
33
|
+
- type: max_tokens
|
|
34
|
+
value: 50
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "promptlab-cli"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Automated testing for LLM prompts. Like pytest, but for prompts."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Vignesh Pai" }
|
|
14
|
+
]
|
|
15
|
+
keywords = ["llm", "testing", "prompts", "claude", "openai", "evaluation", "ai"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Topic :: Software Development :: Testing",
|
|
22
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
23
|
+
]
|
|
24
|
+
dependencies = [
|
|
25
|
+
"click>=8.0",
|
|
26
|
+
"pyyaml>=6.0",
|
|
27
|
+
"httpx>=0.24.0",
|
|
28
|
+
"rich>=13.0",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
dev = [
|
|
33
|
+
"pytest>=7.0",
|
|
34
|
+
"ruff>=0.1.0",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.scripts]
|
|
38
|
+
promptlab = "promptlab.cli:main"
|
|
39
|
+
|
|
40
|
+
[project.urls]
|
|
41
|
+
Homepage = "https://github.com/vigp17/promptlab"
|
|
42
|
+
Repository = "https://github.com/vigp17/promptlab"
|
|
43
|
+
Issues = "https://github.com/vigp17/promptlab/issues"
|
|
44
|
+
|
|
45
|
+
[tool.ruff]
|
|
46
|
+
target-version = "py310"
|
|
47
|
+
line-length = 100
|
|
48
|
+
|
|
49
|
+
[tool.hatch.build.targets.wheel]
|
|
50
|
+
packages = ["src/promptlab"]
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
"""Assertion types for evaluating LLM outputs."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
from promptlab.providers import call_llm
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def check_assertion(assertion: dict, output: str, model: str) -> dict:
    """Evaluate a single assertion against an LLM output.

    Args:
        assertion: Mapping with a "type" key and (for most types) a "value" key.
        output: Raw text produced by the model under test.
        model: Model identifier, forwarded only to the LLM-judge check.

    Returns:
        dict with keys: passed (bool), type, expected, got, message
    """
    kind = assertion["type"]
    expected = assertion.get("value")

    # Dispatch table mapping assertion type -> checker function.
    dispatch = {
        "contains": _check_contains,
        "not_contains": _check_not_contains,
        "starts_with": _check_starts_with,
        "regex": _check_regex,
        "equals": _check_equals,
        "max_tokens": _check_max_tokens,
        "min_length": _check_min_length,
        "max_length": _check_max_length,
        "llm_judge": _check_llm_judge,
    }

    try:
        handler = dispatch[kind]
    except KeyError:
        return {
            "passed": False,
            "type": kind,
            "expected": expected,
            "got": None,
            "message": f"Unknown assertion type: {kind}",
        }

    # Only the judge needs to know which model to call; the rest are pure.
    if kind == "llm_judge":
        return handler(output, expected, model)
    return handler(output, expected)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _check_contains(output: str, value: str) -> dict:
|
|
45
|
+
passed = value.lower() in output.lower()
|
|
46
|
+
return {
|
|
47
|
+
"passed": passed,
|
|
48
|
+
"type": "contains",
|
|
49
|
+
"expected": f'contains "{value}"',
|
|
50
|
+
"got": output[:200] + "..." if len(output) > 200 else output,
|
|
51
|
+
"message": "" if passed else f'Output does not contain "{value}"',
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _check_not_contains(output: str, value: str) -> dict:
|
|
56
|
+
passed = value.lower() not in output.lower()
|
|
57
|
+
return {
|
|
58
|
+
"passed": passed,
|
|
59
|
+
"type": "not_contains",
|
|
60
|
+
"expected": f'does not contain "{value}"',
|
|
61
|
+
"got": output[:200] + "..." if len(output) > 200 else output,
|
|
62
|
+
"message": "" if passed else f'Output contains "{value}" (should not)',
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _check_starts_with(output: str, value: str) -> dict:
|
|
67
|
+
passed = output.strip().lower().startswith(value.lower())
|
|
68
|
+
return {
|
|
69
|
+
"passed": passed,
|
|
70
|
+
"type": "starts_with",
|
|
71
|
+
"expected": f'starts with "{value}"',
|
|
72
|
+
"got": output[:100],
|
|
73
|
+
"message": "" if passed else f'Output does not start with "{value}"',
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _check_regex(output: str, value: str) -> dict:
|
|
78
|
+
try:
|
|
79
|
+
passed = bool(re.search(value, output))
|
|
80
|
+
except re.error as e:
|
|
81
|
+
return {
|
|
82
|
+
"passed": False,
|
|
83
|
+
"type": "regex",
|
|
84
|
+
"expected": f"matches /{value}/",
|
|
85
|
+
"got": None,
|
|
86
|
+
"message": f"Invalid regex: {e}",
|
|
87
|
+
}
|
|
88
|
+
return {
|
|
89
|
+
"passed": passed,
|
|
90
|
+
"type": "regex",
|
|
91
|
+
"expected": f"matches /{value}/",
|
|
92
|
+
"got": output[:200] + "..." if len(output) > 200 else output,
|
|
93
|
+
"message": "" if passed else f"Output does not match regex /{value}/",
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _check_equals(output: str, value: str) -> dict:
|
|
98
|
+
passed = output.strip() == value.strip()
|
|
99
|
+
return {
|
|
100
|
+
"passed": passed,
|
|
101
|
+
"type": "equals",
|
|
102
|
+
"expected": value,
|
|
103
|
+
"got": output.strip(),
|
|
104
|
+
"message": "" if passed else "Output does not exactly match expected value",
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _check_max_tokens(output: str, value: int) -> dict:
|
|
109
|
+
# Rough token estimate: ~4 chars per token
|
|
110
|
+
token_estimate = len(output.split())
|
|
111
|
+
passed = token_estimate <= value
|
|
112
|
+
return {
|
|
113
|
+
"passed": passed,
|
|
114
|
+
"type": "max_tokens",
|
|
115
|
+
"expected": f"<= {value} tokens",
|
|
116
|
+
"got": f"~{token_estimate} tokens ({len(output)} chars)",
|
|
117
|
+
"message": "" if passed else f"Output has ~{token_estimate} tokens, max is {value}",
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _check_min_length(output: str, value: int) -> dict:
|
|
122
|
+
length = len(output.strip())
|
|
123
|
+
passed = length >= value
|
|
124
|
+
return {
|
|
125
|
+
"passed": passed,
|
|
126
|
+
"type": "min_length",
|
|
127
|
+
"expected": f">= {value} chars",
|
|
128
|
+
"got": f"{length} chars",
|
|
129
|
+
"message": "" if passed else f"Output is {length} chars, minimum is {value}",
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _check_max_length(output: str, value: int) -> dict:
|
|
134
|
+
length = len(output.strip())
|
|
135
|
+
passed = length <= value
|
|
136
|
+
return {
|
|
137
|
+
"passed": passed,
|
|
138
|
+
"type": "max_length",
|
|
139
|
+
"expected": f"<= {value} chars",
|
|
140
|
+
"got": f"{length} chars",
|
|
141
|
+
"message": "" if passed else f"Output is {length} chars, maximum is {value}",
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _check_llm_judge(output: str, criteria: str, model: str) -> dict:
    """Ask a second LLM whether *output* satisfies *criteria*.

    The judge is prompted for a bare YES/NO verdict; any failure in the
    judge call (or an unexpected response shape) is reported as a failed
    assertion rather than raised.
    """
    prompt_text = f"""You are evaluating an LLM output. Answer only YES or NO.

Criteria: {criteria}

Output to evaluate:
---
{output}
---

Does the output meet the criteria? Answer only YES or NO."""

    try:
        reply = call_llm(
            prompt=prompt_text,
            model=model,
            temperature=0,
            # A YES/NO verdict needs only a few tokens.
            max_tokens=10,
        )
        verdict = reply["text"].strip().upper()
        ok = verdict.startswith("YES")
    except Exception as exc:
        return {
            "passed": False,
            "type": "llm_judge",
            "expected": criteria,
            "got": f"Judge error: {exc}",
            "message": f"LLM judge failed: {exc}",
        }

    return {
        "passed": ok,
        "type": "llm_judge",
        "expected": criteria,
        "got": f"Judge answered: {verdict}",
        "message": "" if ok else f"LLM judge said NO to: {criteria}",
    }
|