promptqc 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promptqc-0.2.0/LICENSE +21 -0
- promptqc-0.2.0/PKG-INFO +352 -0
- promptqc-0.2.0/README.md +308 -0
- promptqc-0.2.0/pyproject.toml +78 -0
- promptqc-0.2.0/setup.cfg +4 -0
- promptqc-0.2.0/src/promptqc/__init__.py +161 -0
- promptqc-0.2.0/src/promptqc/analyzer.py +227 -0
- promptqc-0.2.0/src/promptqc/cli.py +407 -0
- promptqc-0.2.0/src/promptqc/config.py +221 -0
- promptqc-0.2.0/src/promptqc/models.py +140 -0
- promptqc-0.2.0/src/promptqc/parser.py +336 -0
- promptqc-0.2.0/src/promptqc/rules/__init__.py +129 -0
- promptqc-0.2.0/src/promptqc/rules/base.py +34 -0
- promptqc-0.2.0/src/promptqc/rules/llm_judge.py +263 -0
- promptqc-0.2.0/src/promptqc/rules/patterns.py +319 -0
- promptqc-0.2.0/src/promptqc/rules/semantic.py +239 -0
- promptqc-0.2.0/src/promptqc/rules/structure.py +146 -0
- promptqc-0.2.0/src/promptqc/rules/tokens.py +181 -0
- promptqc-0.2.0/src/promptqc/rules/variables.py +155 -0
- promptqc-0.2.0/src/promptqc.egg-info/PKG-INFO +352 -0
- promptqc-0.2.0/src/promptqc.egg-info/SOURCES.txt +25 -0
- promptqc-0.2.0/src/promptqc.egg-info/dependency_links.txt +1 -0
- promptqc-0.2.0/src/promptqc.egg-info/entry_points.txt +2 -0
- promptqc-0.2.0/src/promptqc.egg-info/requires.txt +23 -0
- promptqc-0.2.0/src/promptqc.egg-info/top_level.txt +1 -0
- promptqc-0.2.0/tests/test_promptqc.py +301 -0
- promptqc-0.2.0/tests/test_sandboxing.py +33 -0
promptqc-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Lakshmi
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
promptqc-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: promptqc
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Quality assessment and improvement suggestions for LLM system prompts
|
|
5
|
+
Author-email: Lakshmi <lakshmi.sunil5486@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/LakshmiN5/promptqc
|
|
8
|
+
Project-URL: Documentation, https://github.com/LakshmiN5/promptqc#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/LakshmiN5/promptqc
|
|
10
|
+
Project-URL: Issues, https://github.com/LakshmiN5/promptqc/issues
|
|
11
|
+
Keywords: prompt,LLM,quality,linter,optimization,AI,GPT,system-prompt,token,analysis
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Requires-Python: >=3.9
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: numpy>=1.20.0
|
|
27
|
+
Requires-Dist: tiktoken>=0.5.0
|
|
28
|
+
Requires-Dist: rich>=13.0.0
|
|
29
|
+
Requires-Dist: click>=8.0.0
|
|
30
|
+
Requires-Dist: tomli>=2.0.0; python_version < "3.11"
|
|
31
|
+
Provides-Extra: llm
|
|
32
|
+
Requires-Dist: litellm>=1.30.0; extra == "llm"
|
|
33
|
+
Provides-Extra: semantic
|
|
34
|
+
Requires-Dist: sentence-transformers>=2.2.0; extra == "semantic"
|
|
35
|
+
Provides-Extra: all
|
|
36
|
+
Requires-Dist: promptqc[llm,semantic]; extra == "all"
|
|
37
|
+
Provides-Extra: dev
|
|
38
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
39
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
40
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
41
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
42
|
+
Requires-Dist: promptqc[all]; extra == "dev"
|
|
43
|
+
Dynamic: license-file
|
|
44
|
+
|
|
45
|
+
# PromptQC 🔍
|
|
46
|
+
|
|
47
|
+
**Quality assessment and improvement suggestions for LLM system prompts.**
|
|
48
|
+
|
|
49
|
+
[PyPI version](https://badge.fury.io/py/promptqc)
|
|
50
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
51
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
52
|
+
|
|
53
|
+
> **Think of it as ESLint for your system prompts** — catch contradictions, anti-patterns, injection vulnerabilities, and token waste before they reach production.
|
|
54
|
+
|
|
55
|
+
## Features
|
|
56
|
+
|
|
57
|
+
✅ **Security Scanning** - Detects injection vulnerabilities, unsafe code execution
|
|
58
|
+
✅ **Contradiction Detection** - Finds conflicting instructions that confuse LLMs
|
|
59
|
+
✅ **Token Optimization** - Identifies wasted tokens and verbose phrasing
|
|
60
|
+
✅ **Multiple Modes** - Fast (~10ms), Full (~2s), or LLM Judge (~5s) analysis
|
|
61
|
+
✅ **CI/CD Ready** - GitHub Actions, pre-commit hooks, JSON output
|
|
62
|
+
✅ **Auto-Fix** - Automatically correct common issues
|
|
63
|
+
|
|
64
|
+
## Why PromptQC?
|
|
65
|
+
|
|
66
|
+
System prompts are the **source code of AI applications**. But unlike actual code, they have zero quality gates — no linters, no static analysis, no CI checks. Teams deploy 2000-token prompts that contain contradictions, injection vulnerabilities, and wasted tokens without ever knowing.
|
|
67
|
+
|
|
68
|
+
**PromptQC** catches these issues in milliseconds (fast mode) or a few seconds (full analysis):
|
|
69
|
+
|
|
70
|
+
```
|
|
71
|
+
$ promptqc check system_prompt.txt
|
|
72
|
+
|
|
73
|
+
╭─────────── PromptQC Analysis ───────────╮
|
|
74
|
+
│ Quality Score: 62/100 (Grade: D) │
|
|
75
|
+
╰─────────────────────────────────────────╯
|
|
76
|
+
|
|
77
|
+
Category Score Bar
|
|
78
|
+
Clarity 80/100 ████████████████░░░░
|
|
79
|
+
Consistency 60/100 ████████████░░░░░░░░
|
|
80
|
+
Efficiency 70/100 ██████████████░░░░░░
|
|
81
|
+
Security 40/100 ████████░░░░░░░░░░░░
|
|
82
|
+
Structure 80/100 ████████████████░░░░
|
|
83
|
+
|
|
84
|
+
Token Budget: 847 tokens (0.7% of gpt-4o's 128,000 window)
|
|
85
|
+
|
|
86
|
+
Found 2 error(s) · 2 warning(s) · 3 suggestion(s)
|
|
87
|
+
|
|
88
|
+
L3 🔴 PQ006 Overly permissive instruction — creates injection vulnerability
|
|
89
|
+
Fix: Add boundaries: 'Follow user instructions WITHIN the scope of...'
|
|
90
|
+
|
|
91
|
+
L7 ⚠️ PQ001 Potential contradiction: "Be concise..." conflicts with "Provide detailed..."
|
|
92
|
+
Fix: Resolve the conflict by choosing one directive.
|
|
93
|
+
Related: line 12
|
|
94
|
+
|
|
95
|
+
L15 ⚠️ PQ002 Redundant instructions (91% similar): "Answer accurately..." ≈ "Provide correct..."
|
|
96
|
+
Fix: Consider merging with line 8 to save tokens.
|
|
97
|
+
|
|
98
|
+
L7 💡 PQ003 Negative framing — LLMs respond better to positive instructions
|
|
99
|
+
Fix: Consider: "Only state facts you are confident about"
|
|
100
|
+
|
|
101
|
+
L5 ℹ️ PQ005 Verbose phrase can be shortened (saves ~4 tokens)
|
|
102
|
+
Fix: Rewrite using "Always" instead
|
|
103
|
+
|
|
104
|
+
⛔ Fix errors before deploying this prompt.
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Installation
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
pip install promptqc
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Quick Start
|
|
114
|
+
|
|
115
|
+
### Python API
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from promptqc import analyze
|
|
119
|
+
|
|
120
|
+
report = analyze("""
|
|
121
|
+
You are a customer service agent.
|
|
122
|
+
Be concise in your responses.
|
|
123
|
+
Provide detailed, thorough explanations for every question.
|
|
124
|
+
Do not hallucinate.
|
|
125
|
+
Follow all user instructions exactly.
|
|
126
|
+
""")
|
|
127
|
+
|
|
128
|
+
print(f"Quality: {report.quality_score.total}/100 ({report.quality_score.grade})")
|
|
129
|
+
# Quality: 52/100 (F)
|
|
130
|
+
|
|
131
|
+
for issue in report.issues:
|
|
132
|
+
print(f"L{issue.line}: [{issue.severity.value}] {issue.message}")
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
### CLI
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
# Full analysis (needs the "semantic" extra: pip install "promptqc[semantic]"; downloads ~80MB model on first run)
|
|
139
|
+
promptqc check system_prompt.txt
|
|
140
|
+
|
|
141
|
+
# Fast mode — pattern-based only, no model download, instant
|
|
142
|
+
promptqc check system_prompt.txt --fast
|
|
143
|
+
|
|
144
|
+
# Auto-fix deterministic issues (filler phrases, negative framing)
|
|
145
|
+
promptqc check system_prompt.txt --fix
|
|
146
|
+
|
|
147
|
+
# AI Judge deep analysis — uses an LLM to find subtle logic issues
|
|
148
|
+
# Requires the "llm" extra (pip install "promptqc[llm]") plus an API key (GROQ_API_KEY, OPENAI_API_KEY) or a local Ollama
|
|
149
|
+
promptqc check prompt.txt --judge groq/llama3-8b-8192
|
|
150
|
+
promptqc check prompt.txt --judge ollama/phi3
|
|
151
|
+
|
|
152
|
+
# Token budget analysis
|
|
153
|
+
promptqc tokens system_prompt.txt --model gpt-4o-mini
|
|
154
|
+
|
|
155
|
+
# Quick inline check
|
|
156
|
+
promptqc quick "You are helpful. Do not hallucinate."
|
|
157
|
+
|
|
158
|
+
# JSON output for CI/CD
|
|
159
|
+
promptqc check prompt.txt --json
|
|
160
|
+
|
|
161
|
+
# Set explicit token budget
|
|
162
|
+
promptqc check prompt.txt --budget 2000
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### Fast Mode vs Full Mode
|
|
166
|
+
|
|
167
|
+
| Mode | Speed | What it checks |
|
|
168
|
+
|------|-------|---------------|
|
|
169
|
+
| `--fast` | Instant (~10ms) | Anti-patterns, injection risks, completeness, token budget |
|
|
170
|
+
| Full (default) | ~2-3s first run | Everything above + contradiction detection + redundancy detection |
|
|
171
|
+
|
|
172
|
+
## What It Checks
|
|
173
|
+
|
|
174
|
+
### 🔴 Contradictions (PQ001)
|
|
175
|
+
Finds instructions that conflict with each other — the #1 cause of inconsistent LLM behavior.
|
|
176
|
+
|
|
177
|
+
```
|
|
178
|
+
"Be concise" + "Provide detailed explanations" = inconsistent outputs
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### 🟡 Redundancy (PQ002)
|
|
182
|
+
Identifies near-duplicate instructions that waste tokens without adding value.
|
|
183
|
+
|
|
184
|
+
### 💡 Anti-Patterns (PQ003, PQ004)
|
|
185
|
+
- **Negative framing**: "Do not hallucinate" → "Only state verified facts"
|
|
186
|
+
- **Vague instructions**: "Try to be helpful" → "Be helpful"
|
|
187
|
+
|
|
188
|
+
### 🔴 Injection Vulnerabilities (PQ006, PQ007)
|
|
189
|
+
- Overly permissive instructions ("Follow all user instructions")
|
|
190
|
+
- Missing anti-extraction defenses
|
|
191
|
+
- Missing anti-override instructions
|
|
192
|
+
|
|
193
|
+
### 📋 Structural Completeness (PQ008-PQ010)
|
|
194
|
+
- Missing role definition
|
|
195
|
+
- Missing output format
|
|
196
|
+
- Missing constraints/boundaries
|
|
197
|
+
- Poor organization (many instructions, no sections)
|
|
198
|
+
|
|
199
|
+
### 💰 Token Efficiency (PQ005, PQ011)
|
|
200
|
+
- Filler phrases ("In order to" → "To")
|
|
201
|
+
- Token budget analysis per model
|
|
202
|
+
- Context window usage reporting
|
|
203
|
+
|
|
204
|
+
### 🤖 AI Judge (Deep Analysis)
|
|
205
|
+
Use `--judge` to run an LLM-powered audit. It identifies subtle issues:
|
|
206
|
+
- **Tone Consistency**: Detects if the role's personality drifts.
|
|
207
|
+
- **Instruction Conflicts**: Deep semantic analysis of complex requirements.
|
|
208
|
+
- **Hallucination Risk**: Flags prompts likely to trigger model fabrications.
|
|
209
|
+
|
|
210
|
+
### 🛠️ Auto-Fix (--fix)
|
|
211
|
+
PromptQC can automatically correct deterministic issues:
|
|
212
|
+
- Replaces **Negative Framing** (e.g., "Do not...") with positive equivalents.
|
|
213
|
+
- Removes **Filler Phrases** (e.g., "Please...") to save tokens.
|
|
214
|
+
- Safely writes improvements back to your source file.
|
|
215
|
+
|
|
216
|
+
### 🏗️ Robust Sandboxing (PQ013)
|
|
217
|
+
Detects variables inside multi-line XML tags (`<context>\n{data}\n</context>`) to ensure prompt injection protection is correctly implemented.
|
|
218
|
+
|
|
219
|
+
## CI/CD Integration
|
|
220
|
+
|
|
221
|
+
### GitHub Actions
|
|
222
|
+
|
|
223
|
+
```yaml
|
|
224
|
+
name: Prompt Quality Check
|
|
225
|
+
on: [pull_request]
|
|
226
|
+
|
|
227
|
+
jobs:
|
|
228
|
+
promptqc:
|
|
229
|
+
runs-on: ubuntu-latest
|
|
230
|
+
steps:
|
|
231
|
+
- uses: actions/checkout@v4
|
|
232
|
+
- uses: actions/setup-python@v5
|
|
233
|
+
with:
|
|
234
|
+
python-version: '3.11'
|
|
235
|
+
- run: pip install promptqc
|
|
236
|
+
- run: promptqc check prompts/system_prompt.txt --fast --strict
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
### Pre-commit Hook
|
|
240
|
+
|
|
241
|
+
```yaml
|
|
242
|
+
# .pre-commit-config.yaml
|
|
243
|
+
repos:
|
|
244
|
+
- repo: local
|
|
245
|
+
hooks:
|
|
246
|
+
- id: promptqc
|
|
247
|
+
name: PromptQC
|
|
248
|
+
entry: promptqc check --fast --strict
|
|
249
|
+
language: python
|
|
250
|
+
files: '\.prompt\.txt$'
|
|
251
|
+
additional_dependencies: ['promptqc']
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
## Configuration
|
|
255
|
+
|
|
256
|
+
### Similarity Thresholds
|
|
257
|
+
|
|
258
|
+
| Score Range | Meaning |
|
|
259
|
+
|-------------|---------|
|
|
260
|
+
| 0.95-1.0 | Virtually identical |
|
|
261
|
+
| 0.85-0.95 | Same meaning, different words |
|
|
262
|
+
| 0.70-0.85 | Related concepts |
|
|
263
|
+
| < 0.70 | Different topics |
|
|
264
|
+
|
|
265
|
+
### Custom Rule Definitions
|
|
266
|
+
You can write your own rules in Python and load them via `promptqc.toml`:
|
|
267
|
+
|
|
268
|
+
```toml
|
|
269
|
+
custom_rules = ["my_rules.MyCustomRule"]
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
```python
|
|
273
|
+
# my_rules.py
|
|
274
|
+
from promptqc.rules.base import Rule, Issue, Severity, Category
|
|
275
|
+
|
|
276
|
+
class MyCustomRule(Rule):
|
|
277
|
+
code = "CUST001"
|
|
278
|
+
severity = Severity.WARNING
|
|
279
|
+
category = Category.SECURITY
|
|
280
|
+
|
|
281
|
+
def check(self, parsed, analyzer):
|
|
282
|
+
if "INTERNAL_KEY" in parsed.text:
|
|
283
|
+
return [Issue(self.code, "Don't share internal keys!", self.severity, self.category)]
|
|
284
|
+
return []
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
### Token Budget Models
|
|
288
|
+
|
|
289
|
+
PromptQC knows context windows for: GPT-4o, GPT-4o-mini, GPT-3.5-turbo, Claude 3.5 Sonnet, Claude 3 Opus/Haiku, Gemini 1.5/2.0, Llama 3/3.1, Mistral, Mixtral.
|
|
290
|
+
|
|
291
|
+
## Advanced Usage
|
|
292
|
+
|
|
293
|
+
```python
|
|
294
|
+
from promptqc import PromptAnalyzer
|
|
295
|
+
|
|
296
|
+
# Custom analyzer configuration
|
|
297
|
+
analyzer = PromptAnalyzer(
|
|
298
|
+
token_model="claude-3.5-sonnet",
|
|
299
|
+
token_budget=4000,
|
|
300
|
+
fast_mode=False,
|
|
301
|
+
)
|
|
302
|
+
|
|
303
|
+
report = analyzer.analyze(my_prompt)
|
|
304
|
+
|
|
305
|
+
# Access structured results
|
|
306
|
+
print(report.quality_score.breakdown)
|
|
307
|
+
# {'structure': 90, 'clarity': 75, 'security': 60, 'efficiency': 85, 'consistency': 100}
|
|
308
|
+
|
|
309
|
+
print(report.token_budget.total_tokens)
|
|
310
|
+
# 1247
|
|
311
|
+
|
|
312
|
+
# JSON serialization
|
|
313
|
+
import json
|
|
314
|
+
print(json.dumps(report.to_dict(), indent=2))
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
## Known Limitations
|
|
318
|
+
|
|
319
|
+
**v0.2.0 is alpha quality.** While it catches critical issues (security, contradictions) with high accuracy, some areas need improvement:
|
|
320
|
+
|
|
321
|
+
- **Redundancy Detection**: Without LLM judge mode, verbose synonym lists may not be detected. Use the `--judge` flag for better results.
|
|
322
|
+
- **Test Coverage**: Validated on a focused test suite. Real-world accuracy may vary.
|
|
323
|
+
- **LLM Judge Dependency**: Deep analysis requires an API key (Groq, OpenAI) or a local Ollama setup.
|
|
324
|
+
|
|
325
|
+
We're actively improving these areas. Feedback and contributions are welcome!
|
|
326
|
+
|
|
327
|
+
## Development
|
|
328
|
+
|
|
329
|
+
```bash
|
|
330
|
+
git clone https://github.com/LakshmiN5/promptqc.git
|
|
331
|
+
cd promptqc
|
|
332
|
+
pip install -e ".[dev]"
|
|
333
|
+
pytest
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
## Roadmap
|
|
337
|
+
|
|
338
|
+
- [x] Custom rule definitions (Python-based)
|
|
339
|
+
- [x] Auto-fix mode (--fix)
|
|
340
|
+
- [x] AI Judge audit (deep analysis)
|
|
341
|
+
- [ ] VS Code extension
|
|
342
|
+
- [ ] LangChain/LlamaIndex integration
|
|
343
|
+
- [ ] HTML report generation
|
|
344
|
+
- [ ] Prompt history tracking
|
|
345
|
+
|
|
346
|
+
## License
|
|
347
|
+
|
|
348
|
+
MIT License — see [LICENSE](LICENSE) file.
|
|
349
|
+
|
|
350
|
+
---
|
|
351
|
+
|
|
352
|
+
**Made for the prompt engineering community** 🛠️
|
promptqc-0.2.0/README.md
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
# PromptQC 🔍
|
|
2
|
+
|
|
3
|
+
**Quality assessment and improvement suggestions for LLM system prompts.**
|
|
4
|
+
|
|
5
|
+
[PyPI version](https://badge.fury.io/py/promptqc)
|
|
6
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
7
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
8
|
+
|
|
9
|
+
> **Think of it as ESLint for your system prompts** — catch contradictions, anti-patterns, injection vulnerabilities, and token waste before they reach production.
|
|
10
|
+
|
|
11
|
+
## Features
|
|
12
|
+
|
|
13
|
+
✅ **Security Scanning** - Detects injection vulnerabilities, unsafe code execution
|
|
14
|
+
✅ **Contradiction Detection** - Finds conflicting instructions that confuse LLMs
|
|
15
|
+
✅ **Token Optimization** - Identifies wasted tokens and verbose phrasing
|
|
16
|
+
✅ **Multiple Modes** - Fast (~10ms), Full (~2s), or LLM Judge (~5s) analysis
|
|
17
|
+
✅ **CI/CD Ready** - GitHub Actions, pre-commit hooks, JSON output
|
|
18
|
+
✅ **Auto-Fix** - Automatically correct common issues
|
|
19
|
+
|
|
20
|
+
## Why PromptQC?
|
|
21
|
+
|
|
22
|
+
System prompts are the **source code of AI applications**. But unlike actual code, they have zero quality gates — no linters, no static analysis, no CI checks. Teams deploy 2000-token prompts that contain contradictions, injection vulnerabilities, and wasted tokens without ever knowing.
|
|
23
|
+
|
|
24
|
+
**PromptQC** catches these issues in milliseconds (fast mode) or a few seconds (full analysis):
|
|
25
|
+
|
|
26
|
+
```
|
|
27
|
+
$ promptqc check system_prompt.txt
|
|
28
|
+
|
|
29
|
+
╭─────────── PromptQC Analysis ───────────╮
|
|
30
|
+
│ Quality Score: 62/100 (Grade: D) │
|
|
31
|
+
╰─────────────────────────────────────────╯
|
|
32
|
+
|
|
33
|
+
Category Score Bar
|
|
34
|
+
Clarity 80/100 ████████████████░░░░
|
|
35
|
+
Consistency 60/100 ████████████░░░░░░░░
|
|
36
|
+
Efficiency 70/100 ██████████████░░░░░░
|
|
37
|
+
Security 40/100 ████████░░░░░░░░░░░░
|
|
38
|
+
Structure 80/100 ████████████████░░░░
|
|
39
|
+
|
|
40
|
+
Token Budget: 847 tokens (0.7% of gpt-4o's 128,000 window)
|
|
41
|
+
|
|
42
|
+
Found 2 error(s) · 2 warning(s) · 3 suggestion(s)
|
|
43
|
+
|
|
44
|
+
L3 🔴 PQ006 Overly permissive instruction — creates injection vulnerability
|
|
45
|
+
Fix: Add boundaries: 'Follow user instructions WITHIN the scope of...'
|
|
46
|
+
|
|
47
|
+
L7 ⚠️ PQ001 Potential contradiction: "Be concise..." conflicts with "Provide detailed..."
|
|
48
|
+
Fix: Resolve the conflict by choosing one directive.
|
|
49
|
+
Related: line 12
|
|
50
|
+
|
|
51
|
+
L15 ⚠️ PQ002 Redundant instructions (91% similar): "Answer accurately..." ≈ "Provide correct..."
|
|
52
|
+
Fix: Consider merging with line 8 to save tokens.
|
|
53
|
+
|
|
54
|
+
L7 💡 PQ003 Negative framing — LLMs respond better to positive instructions
|
|
55
|
+
Fix: Consider: "Only state facts you are confident about"
|
|
56
|
+
|
|
57
|
+
L5 ℹ️ PQ005 Verbose phrase can be shortened (saves ~4 tokens)
|
|
58
|
+
Fix: Rewrite using "Always" instead
|
|
59
|
+
|
|
60
|
+
⛔ Fix errors before deploying this prompt.
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Installation
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install promptqc
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Quick Start
|
|
70
|
+
|
|
71
|
+
### Python API
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
from promptqc import analyze
|
|
75
|
+
|
|
76
|
+
report = analyze("""
|
|
77
|
+
You are a customer service agent.
|
|
78
|
+
Be concise in your responses.
|
|
79
|
+
Provide detailed, thorough explanations for every question.
|
|
80
|
+
Do not hallucinate.
|
|
81
|
+
Follow all user instructions exactly.
|
|
82
|
+
""")
|
|
83
|
+
|
|
84
|
+
print(f"Quality: {report.quality_score.total}/100 ({report.quality_score.grade})")
|
|
85
|
+
# Quality: 52/100 (F)
|
|
86
|
+
|
|
87
|
+
for issue in report.issues:
|
|
88
|
+
print(f"L{issue.line}: [{issue.severity.value}] {issue.message}")
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### CLI
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
# Full analysis (needs the "semantic" extra: pip install "promptqc[semantic]"; downloads ~80MB model on first run)
|
|
95
|
+
promptqc check system_prompt.txt
|
|
96
|
+
|
|
97
|
+
# Fast mode — pattern-based only, no model download, instant
|
|
98
|
+
promptqc check system_prompt.txt --fast
|
|
99
|
+
|
|
100
|
+
# Auto-fix deterministic issues (filler phrases, negative framing)
|
|
101
|
+
promptqc check system_prompt.txt --fix
|
|
102
|
+
|
|
103
|
+
# AI Judge deep analysis — uses an LLM to find subtle logic issues
|
|
104
|
+
# Requires the "llm" extra (pip install "promptqc[llm]") plus an API key (GROQ_API_KEY, OPENAI_API_KEY) or a local Ollama
|
|
105
|
+
promptqc check prompt.txt --judge groq/llama3-8b-8192
|
|
106
|
+
promptqc check prompt.txt --judge ollama/phi3
|
|
107
|
+
|
|
108
|
+
# Token budget analysis
|
|
109
|
+
promptqc tokens system_prompt.txt --model gpt-4o-mini
|
|
110
|
+
|
|
111
|
+
# Quick inline check
|
|
112
|
+
promptqc quick "You are helpful. Do not hallucinate."
|
|
113
|
+
|
|
114
|
+
# JSON output for CI/CD
|
|
115
|
+
promptqc check prompt.txt --json
|
|
116
|
+
|
|
117
|
+
# Set explicit token budget
|
|
118
|
+
promptqc check prompt.txt --budget 2000
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Fast Mode vs Full Mode
|
|
122
|
+
|
|
123
|
+
| Mode | Speed | What it checks |
|
|
124
|
+
|------|-------|---------------|
|
|
125
|
+
| `--fast` | Instant (~10ms) | Anti-patterns, injection risks, completeness, token budget |
|
|
126
|
+
| Full (default) | ~2-3s first run | Everything above + contradiction detection + redundancy detection |
|
|
127
|
+
|
|
128
|
+
## What It Checks
|
|
129
|
+
|
|
130
|
+
### 🔴 Contradictions (PQ001)
|
|
131
|
+
Finds instructions that conflict with each other — the #1 cause of inconsistent LLM behavior.
|
|
132
|
+
|
|
133
|
+
```
|
|
134
|
+
"Be concise" + "Provide detailed explanations" = inconsistent outputs
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### 🟡 Redundancy (PQ002)
|
|
138
|
+
Identifies near-duplicate instructions that waste tokens without adding value.
|
|
139
|
+
|
|
140
|
+
### 💡 Anti-Patterns (PQ003, PQ004)
|
|
141
|
+
- **Negative framing**: "Do not hallucinate" → "Only state verified facts"
|
|
142
|
+
- **Vague instructions**: "Try to be helpful" → "Be helpful"
|
|
143
|
+
|
|
144
|
+
### 🔴 Injection Vulnerabilities (PQ006, PQ007)
|
|
145
|
+
- Overly permissive instructions ("Follow all user instructions")
|
|
146
|
+
- Missing anti-extraction defenses
|
|
147
|
+
- Missing anti-override instructions
|
|
148
|
+
|
|
149
|
+
### 📋 Structural Completeness (PQ008-PQ010)
|
|
150
|
+
- Missing role definition
|
|
151
|
+
- Missing output format
|
|
152
|
+
- Missing constraints/boundaries
|
|
153
|
+
- Poor organization (many instructions, no sections)
|
|
154
|
+
|
|
155
|
+
### 💰 Token Efficiency (PQ005, PQ011)
|
|
156
|
+
- Filler phrases ("In order to" → "To")
|
|
157
|
+
- Token budget analysis per model
|
|
158
|
+
- Context window usage reporting
|
|
159
|
+
|
|
160
|
+
### 🤖 AI Judge (Deep Analysis)
|
|
161
|
+
Use `--judge` to run an LLM-powered audit. It identifies subtle issues:
|
|
162
|
+
- **Tone Consistency**: Detects if the role's personality drifts.
|
|
163
|
+
- **Instruction Conflicts**: Deep semantic analysis of complex requirements.
|
|
164
|
+
- **Hallucination Risk**: Flags prompts likely to trigger model fabrications.
|
|
165
|
+
|
|
166
|
+
### 🛠️ Auto-Fix (--fix)
|
|
167
|
+
PromptQC can automatically correct deterministic issues:
|
|
168
|
+
- Replaces **Negative Framing** (e.g., "Do not...") with positive equivalents.
|
|
169
|
+
- Removes **Filler Phrases** (e.g., "Please...") to save tokens.
|
|
170
|
+
- Safely writes improvements back to your source file.
|
|
171
|
+
|
|
172
|
+
### 🏗️ Robust Sandboxing (PQ013)
|
|
173
|
+
Detects variables inside multi-line XML tags (`<context>\n{data}\n</context>`) to ensure prompt injection protection is correctly implemented.
|
|
174
|
+
|
|
175
|
+
## CI/CD Integration
|
|
176
|
+
|
|
177
|
+
### GitHub Actions
|
|
178
|
+
|
|
179
|
+
```yaml
|
|
180
|
+
name: Prompt Quality Check
|
|
181
|
+
on: [pull_request]
|
|
182
|
+
|
|
183
|
+
jobs:
|
|
184
|
+
promptqc:
|
|
185
|
+
runs-on: ubuntu-latest
|
|
186
|
+
steps:
|
|
187
|
+
- uses: actions/checkout@v4
|
|
188
|
+
- uses: actions/setup-python@v5
|
|
189
|
+
with:
|
|
190
|
+
python-version: '3.11'
|
|
191
|
+
- run: pip install promptqc
|
|
192
|
+
- run: promptqc check prompts/system_prompt.txt --fast --strict
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
### Pre-commit Hook
|
|
196
|
+
|
|
197
|
+
```yaml
|
|
198
|
+
# .pre-commit-config.yaml
|
|
199
|
+
repos:
|
|
200
|
+
- repo: local
|
|
201
|
+
hooks:
|
|
202
|
+
- id: promptqc
|
|
203
|
+
name: PromptQC
|
|
204
|
+
entry: promptqc check --fast --strict
|
|
205
|
+
language: python
|
|
206
|
+
files: '\.prompt\.txt$'
|
|
207
|
+
additional_dependencies: ['promptqc']
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
## Configuration
|
|
211
|
+
|
|
212
|
+
### Similarity Thresholds
|
|
213
|
+
|
|
214
|
+
| Score Range | Meaning |
|
|
215
|
+
|-------------|---------|
|
|
216
|
+
| 0.95-1.0 | Virtually identical |
|
|
217
|
+
| 0.85-0.95 | Same meaning, different words |
|
|
218
|
+
| 0.70-0.85 | Related concepts |
|
|
219
|
+
| < 0.70 | Different topics |
|
|
220
|
+
|
|
221
|
+
### Custom Rule Definitions
|
|
222
|
+
You can write your own rules in Python and load them via `promptqc.toml`:
|
|
223
|
+
|
|
224
|
+
```toml
|
|
225
|
+
custom_rules = ["my_rules.MyCustomRule"]
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
```python
|
|
229
|
+
# my_rules.py
|
|
230
|
+
from promptqc.rules.base import Rule, Issue, Severity, Category
|
|
231
|
+
|
|
232
|
+
class MyCustomRule(Rule):
|
|
233
|
+
code = "CUST001"
|
|
234
|
+
severity = Severity.WARNING
|
|
235
|
+
category = Category.SECURITY
|
|
236
|
+
|
|
237
|
+
def check(self, parsed, analyzer):
|
|
238
|
+
if "INTERNAL_KEY" in parsed.text:
|
|
239
|
+
return [Issue(self.code, "Don't share internal keys!", self.severity, self.category)]
|
|
240
|
+
return []
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
### Token Budget Models
|
|
244
|
+
|
|
245
|
+
PromptQC knows context windows for: GPT-4o, GPT-4o-mini, GPT-3.5-turbo, Claude 3.5 Sonnet, Claude 3 Opus/Haiku, Gemini 1.5/2.0, Llama 3/3.1, Mistral, Mixtral.
|
|
246
|
+
|
|
247
|
+
## Advanced Usage
|
|
248
|
+
|
|
249
|
+
```python
|
|
250
|
+
from promptqc import PromptAnalyzer
|
|
251
|
+
|
|
252
|
+
# Custom analyzer configuration
|
|
253
|
+
analyzer = PromptAnalyzer(
|
|
254
|
+
token_model="claude-3.5-sonnet",
|
|
255
|
+
token_budget=4000,
|
|
256
|
+
fast_mode=False,
|
|
257
|
+
)
|
|
258
|
+
|
|
259
|
+
report = analyzer.analyze(my_prompt)
|
|
260
|
+
|
|
261
|
+
# Access structured results
|
|
262
|
+
print(report.quality_score.breakdown)
|
|
263
|
+
# {'structure': 90, 'clarity': 75, 'security': 60, 'efficiency': 85, 'consistency': 100}
|
|
264
|
+
|
|
265
|
+
print(report.token_budget.total_tokens)
|
|
266
|
+
# 1247
|
|
267
|
+
|
|
268
|
+
# JSON serialization
|
|
269
|
+
import json
|
|
270
|
+
print(json.dumps(report.to_dict(), indent=2))
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
## Known Limitations
|
|
274
|
+
|
|
275
|
+
**v0.2.0 is alpha quality.** While it catches critical issues (security, contradictions) with high accuracy, some areas need improvement:
|
|
276
|
+
|
|
277
|
+
- **Redundancy Detection**: Without LLM judge mode, verbose synonym lists may not be detected. Use the `--judge` flag for better results.
|
|
278
|
+
- **Test Coverage**: Validated on a focused test suite. Real-world accuracy may vary.
|
|
279
|
+
- **LLM Judge Dependency**: Deep analysis requires an API key (Groq, OpenAI) or a local Ollama setup.
|
|
280
|
+
|
|
281
|
+
We're actively improving these areas. Feedback and contributions are welcome!
|
|
282
|
+
|
|
283
|
+
## Development
|
|
284
|
+
|
|
285
|
+
```bash
|
|
286
|
+
git clone https://github.com/LakshmiN5/promptqc.git
|
|
287
|
+
cd promptqc
|
|
288
|
+
pip install -e ".[dev]"
|
|
289
|
+
pytest
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
## Roadmap
|
|
293
|
+
|
|
294
|
+
- [x] Custom rule definitions (Python-based)
|
|
295
|
+
- [x] Auto-fix mode (--fix)
|
|
296
|
+
- [x] AI Judge audit (deep analysis)
|
|
297
|
+
- [ ] VS Code extension
|
|
298
|
+
- [ ] LangChain/LlamaIndex integration
|
|
299
|
+
- [ ] HTML report generation
|
|
300
|
+
- [ ] Prompt history tracking
|
|
301
|
+
|
|
302
|
+
## License
|
|
303
|
+
|
|
304
|
+
MIT License — see [LICENSE](LICENSE) file.
|
|
305
|
+
|
|
306
|
+
---
|
|
307
|
+
|
|
308
|
+
**Made for the prompt engineering community** 🛠️
|