promptdebug 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promptdebug-0.2.0/.github/ISSUE_TEMPLATE/bug_report.md +25 -0
- promptdebug-0.2.0/.github/ISSUE_TEMPLATE/feature_request.md +19 -0
- promptdebug-0.2.0/.github/workflows/ci.yml +39 -0
- promptdebug-0.2.0/.github/workflows/publish.yml +25 -0
- promptdebug-0.2.0/.gitignore +29 -0
- promptdebug-0.2.0/CHANGELOG.md +52 -0
- promptdebug-0.2.0/LICENSE +21 -0
- promptdebug-0.2.0/PKG-INFO +346 -0
- promptdebug-0.2.0/README.md +300 -0
- promptdebug-0.2.0/THIRD_PARTY_LICENSES.md +101 -0
- promptdebug-0.2.0/examples/coding_assistant.txt +48 -0
- promptdebug-0.2.0/examples/content_moderator.txt +112 -0
- promptdebug-0.2.0/examples/customer_support.txt +61 -0
- promptdebug-0.2.0/examples/data_extractor.txt +191 -0
- promptdebug-0.2.0/examples/rag_pipeline.txt +58 -0
- promptdebug-0.2.0/pyproject.toml +93 -0
- promptdebug-0.2.0/src/promptdebug/__init__.py +54 -0
- promptdebug-0.2.0/src/promptdebug/ablation.py +568 -0
- promptdebug-0.2.0/src/promptdebug/cache.py +125 -0
- promptdebug-0.2.0/src/promptdebug/cli.py +743 -0
- promptdebug-0.2.0/src/promptdebug/config.py +125 -0
- promptdebug-0.2.0/src/promptdebug/parser.py +376 -0
- promptdebug-0.2.0/src/promptdebug/providers.py +82 -0
- promptdebug-0.2.0/src/promptdebug/renderer.py +419 -0
- promptdebug-0.2.0/src/promptdebug/scoring.py +569 -0
- promptdebug-0.2.0/tests/__init__.py +0 -0
- promptdebug-0.2.0/tests/test_ablation.py +125 -0
- promptdebug-0.2.0/tests/test_ablation_stress.py +735 -0
- promptdebug-0.2.0/tests/test_cache.py +123 -0
- promptdebug-0.2.0/tests/test_cache_stress.py +470 -0
- promptdebug-0.2.0/tests/test_cli.py +205 -0
- promptdebug-0.2.0/tests/test_config.py +93 -0
- promptdebug-0.2.0/tests/test_config_stress.py +477 -0
- promptdebug-0.2.0/tests/test_coverage_gaps.py +1535 -0
- promptdebug-0.2.0/tests/test_integration.py +485 -0
- promptdebug-0.2.0/tests/test_integration_real.py +209 -0
- promptdebug-0.2.0/tests/test_new_features_stress.py +1259 -0
- promptdebug-0.2.0/tests/test_parser.py +137 -0
- promptdebug-0.2.0/tests/test_parser_stress.py +769 -0
- promptdebug-0.2.0/tests/test_providers.py +288 -0
- promptdebug-0.2.0/tests/test_renderer.py +69 -0
- promptdebug-0.2.0/tests/test_renderer_stress.py +861 -0
- promptdebug-0.2.0/tests/test_scoring.py +112 -0
- promptdebug-0.2.0/tests/test_scoring_stress.py +1011 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Bug report
|
|
3
|
+
about: Report a bug in promptdebug
|
|
4
|
+
title: ''
|
|
5
|
+
labels: bug
|
|
6
|
+
assignees: ''
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
**Describe the bug**
|
|
10
|
+
A clear description of what the bug is.
|
|
11
|
+
|
|
12
|
+
**To Reproduce**
|
|
13
|
+
```bash
|
|
14
|
+
# Minimal command to reproduce
|
|
15
|
+
promptdebug analyze prompt.txt --query "..."
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
**Expected behavior**
|
|
19
|
+
What you expected to happen.
|
|
20
|
+
|
|
21
|
+
**Environment**
|
|
22
|
+
- promptdebug version:
|
|
23
|
+
- Python version:
|
|
24
|
+
- OS:
|
|
25
|
+
- Model used:
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: Feature request
|
|
3
|
+
about: Suggest an idea for promptdebug
|
|
4
|
+
title: ''
|
|
5
|
+
labels: enhancement
|
|
6
|
+
assignees: ''
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
**Is your feature request related to a problem?**
|
|
10
|
+
A clear description of the problem.
|
|
11
|
+
|
|
12
|
+
**Describe the solution you'd like**
|
|
13
|
+
What you want to happen.
|
|
14
|
+
|
|
15
|
+
**Describe alternatives you've considered**
|
|
16
|
+
Other solutions you've thought about.
|
|
17
|
+
|
|
18
|
+
**Additional context**
|
|
19
|
+
Any other context about the feature request.
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
fail-fast: false
|
|
14
|
+
matrix:
|
|
15
|
+
python-version: ["3.10", "3.11", "3.12", "3.13", "3.14-dev"]
|
|
16
|
+
include:
|
|
17
|
+
- python-version: "3.14-dev"
|
|
18
|
+
experimental: true
|
|
19
|
+
|
|
20
|
+
continue-on-error: ${{ matrix.experimental || false }}
|
|
21
|
+
steps:
|
|
22
|
+
- uses: actions/checkout@v4
|
|
23
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
24
|
+
uses: actions/setup-python@v5
|
|
25
|
+
with:
|
|
26
|
+
python-version: ${{ matrix.python-version }}
|
|
27
|
+
allow-prereleases: true
|
|
28
|
+
- name: Install dependencies
|
|
29
|
+
run: |
|
|
30
|
+
python -m pip install --upgrade pip
|
|
31
|
+
pip install -e ".[dev]"
|
|
32
|
+
- name: Lint with ruff
|
|
33
|
+
run: ruff check src/
|
|
34
|
+
- name: Type check with mypy
|
|
35
|
+
run: mypy src/
|
|
36
|
+
- name: Run tests
|
|
37
|
+
run: pytest tests/ --ignore=tests/test_integration.py --ignore=tests/test_integration_real.py
|
|
38
|
+
env:
|
|
39
|
+
NO_COLOR: "1"
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
publish:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
environment: pypi
|
|
11
|
+
permissions:
|
|
12
|
+
id-token: write # Required for trusted publishing
|
|
13
|
+
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
- name: Set up Python
|
|
17
|
+
uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: "3.11"
|
|
20
|
+
- name: Install build dependencies
|
|
21
|
+
run: pip install build hatchling
|
|
22
|
+
- name: Build package
|
|
23
|
+
run: python -m build
|
|
24
|
+
- name: Publish to PyPI
|
|
25
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
.env
|
|
2
|
+
.env.*
|
|
3
|
+
.venv/
|
|
4
|
+
__pycache__/
|
|
5
|
+
*.pyc
|
|
6
|
+
*.pyo
|
|
7
|
+
*.egg-info/
|
|
8
|
+
*.egg
|
|
9
|
+
dist/
|
|
10
|
+
build/
|
|
11
|
+
.promptdebug_cache.db
|
|
12
|
+
.promptdebug_cache.db-shm
|
|
13
|
+
.promptdebug_cache.db-wal
|
|
14
|
+
.pytest_cache/
|
|
15
|
+
.mypy_cache/
|
|
16
|
+
.ruff_cache/
|
|
17
|
+
.coverage
|
|
18
|
+
htmlcov/
|
|
19
|
+
*.so
|
|
20
|
+
*.log
|
|
21
|
+
*.swp
|
|
22
|
+
*.swo
|
|
23
|
+
.DS_Store
|
|
24
|
+
.claude/
|
|
25
|
+
.idea/
|
|
26
|
+
.vscode/
|
|
27
|
+
docs/
|
|
28
|
+
pilot/.venv/
|
|
29
|
+
pilot/results/
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.2.0] - 2026-03-08
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- **Multi-query ablation** (`--queries queries.txt`): run ablation across multiple test queries and aggregate influence scores, giving a more robust, query-independent result
|
|
13
|
+
- **Sanity check** (`--sanity-check`): inject a known-high-influence counterfactual section and verify the scoring engine detects it — flags unreliable analyses before you act on them
|
|
14
|
+
- **Rewrite suggestions** (`--suggest`): for every dead section, generate LLM-powered replacement candidates via `generate_all_suggestions`
|
|
15
|
+
- **Watch mode** (`promptdebug watch prompt.txt --query "..."`): poll a prompt file and re-run analysis automatically on every save — no more manual re-runs during iteration
|
|
16
|
+
- **Diff command** (`promptdebug diff prompt.txt --ref HEAD~1 --query "..."`): compare influence scores between the current prompt and any git revision
|
|
17
|
+
- **Integration test suite** (`tests/test_integration.py`): 27 tests that make live OpenAI API calls — covers provider round-trips, ablation structure, multi-query aggregation, sanity check, suggestions, cache, and the full customer-support pipeline
|
|
18
|
+
- **Coverage gap tests** (`tests/test_coverage_gaps.py`): 187 tests filling every untested branch across all 8 modules (scoring, parser, renderer, config, providers, ablation, cli, cache)
|
|
19
|
+
- **Stress tests for new features** (`tests/test_new_features_stress.py`): 68 adversarial tests targeting the 5 new features
|
|
20
|
+
|
|
21
|
+
### Infrastructure
|
|
22
|
+
|
|
23
|
+
- Added CI workflow (GitHub Actions) running tests on Python 3.10–3.14 with ruff and mypy
|
|
24
|
+
- Added trusted-publisher workflow for automatic PyPI release on GitHub release events
|
|
25
|
+
- Added GitHub issue templates (bug report, feature request)
|
|
26
|
+
- Added `ruff`, `mypy`, `types-PyYAML`, `build`, `twine` to dev dependencies
|
|
27
|
+
- Fixed `.gitignore` to exclude SQLite WAL files (`.db-shm`, `.db-wal`)
|
|
28
|
+
|
|
29
|
+
### Fixed
|
|
30
|
+
|
|
31
|
+
- `np.mean([])` crash in multi-query aggregation when a per-query run has fewer sections than the reference — now falls back gracefully to the reference score
|
|
32
|
+
- `generate_all_suggestions` was executing sequentially despite using `asyncio.Semaphore` — fixed to use `asyncio.gather` for true parallelism
|
|
33
|
+
- `watch` command triggered an immediate spurious analysis on startup — fixed by initialising `last_mtime` / `last_content` from the actual file state
|
|
34
|
+
- `analyze` command read the queries file twice (once for sanity check, once for multi-query) — consolidated to a single pre-read
|
|
35
|
+
- `diff` command failed with absolute paths — `git show ref:/absolute/path` always errors; fixed by resolving the repo-relative path via `git rev-parse --show-toplevel`
|
|
36
|
+
- Sanity check section was merged into the last existing section by the parser — fixed by adding a `## __calibration_control__` markdown header so the parser creates a distinct section
|
|
37
|
+
|
|
38
|
+
## [0.1.0] - 2026-03-07
|
|
39
|
+
|
|
40
|
+
### Added
|
|
41
|
+
|
|
42
|
+
- Core ablation engine with leave-one-out section removal
|
|
43
|
+
- Composite influence scoring: semantic (0.60) + structural (0.20) + behavioral (0.20)
|
|
44
|
+
- Semantic similarity via sentence-transformers (all-mpnet-base-v2)
|
|
45
|
+
- Smart prompt parsing with 6 strategies: markdown headers, XML tags, labeled blocks, numbered lists, double newlines, single-section fallback
|
|
46
|
+
- CLI commands: `analyze`, `compare`, `optimize`
|
|
47
|
+
- Output formats: terminal heatmap, HTML report, JSON, CSV
|
|
48
|
+
- SQLite-based response cache with SHA256 content-hash keys
|
|
49
|
+
- Multi-model support via LiteLLM (OpenAI, Anthropic, Google, Mistral, Ollama)
|
|
50
|
+
- Dry-run mode with cost estimation
|
|
51
|
+
- YAML configuration file support (`.promptdebug.yml`)
|
|
52
|
+
- 503 tests with full coverage of all modules
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Zaur Jafarov
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,346 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: promptdebug
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Find dead tokens in your system prompts. Ablation-based influence analysis for LLM prompts.
|
|
5
|
+
Project-URL: Homepage, https://github.com/entropyvector/promptdebug
|
|
6
|
+
Project-URL: Documentation, https://github.com/entropyvector/promptdebug#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/entropyvector/promptdebug
|
|
8
|
+
Project-URL: Issues, https://github.com/entropyvector/promptdebug/issues
|
|
9
|
+
Project-URL: Changelog, https://github.com/entropyvector/promptdebug/blob/main/CHANGELOG.md
|
|
10
|
+
Author-email: Zaur Jafarov <entropyvector.dev@gmail.com>
|
|
11
|
+
License-Expression: MIT
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Keywords: ablation,debugging,llm,optimization,prompt
|
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
25
|
+
Classifier: Topic :: Software Development :: Testing
|
|
26
|
+
Classifier: Typing :: Typed
|
|
27
|
+
Requires-Python: >=3.10
|
|
28
|
+
Requires-Dist: jinja2>=3.1.0
|
|
29
|
+
Requires-Dist: litellm>=1.40.0
|
|
30
|
+
Requires-Dist: numpy>=1.26.0
|
|
31
|
+
Requires-Dist: pyyaml>=6.0
|
|
32
|
+
Requires-Dist: rich>=13.0.0
|
|
33
|
+
Requires-Dist: sentence-transformers>=3.0.0
|
|
34
|
+
Requires-Dist: tiktoken>=0.7.0
|
|
35
|
+
Requires-Dist: typer>=0.12.0
|
|
36
|
+
Provides-Extra: dev
|
|
37
|
+
Requires-Dist: build>=1.0; extra == 'dev'
|
|
38
|
+
Requires-Dist: mypy>=1.0; extra == 'dev'
|
|
39
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
|
|
40
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
41
|
+
Requires-Dist: python-dotenv>=1.0.0; extra == 'dev'
|
|
42
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
43
|
+
Requires-Dist: twine>=4.0; extra == 'dev'
|
|
44
|
+
Requires-Dist: types-pyyaml>=6.0; extra == 'dev'
|
|
45
|
+
Description-Content-Type: text/markdown
|
|
46
|
+
|
|
47
|
+
# promptdebug
|
|
48
|
+
|
|
49
|
+
[](https://pypi.org/project/promptdebug/)
|
|
50
|
+
[](https://pepy.tech/project/promptdebug)
|
|
51
|
+
[](https://github.com/entropyvector/promptdebug/actions)
|
|
52
|
+
[](https://opensource.org/licenses/MIT)
|
|
53
|
+
[](https://www.python.org/downloads/)
|
|
54
|
+
|
|
55
|
+
Find dead tokens in your system prompts. Ablation-based influence analysis for LLM prompts.
|
|
56
|
+
|
|
57
|
+
promptdebug systematically removes each section of your system prompt and measures how the model's output changes. Sections that can be removed without affecting the output are **dead weight** — tokens you're paying for that do nothing.
|
|
58
|
+
|
|
59
|
+
## Install
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install promptdebug
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
> **Note:** On first run, promptdebug downloads the `all-mpnet-base-v2` sentence-transformers model (~420 MB) for semantic scoring. This happens once and is cached locally by the `sentence-transformers` library.
|
|
66
|
+
|
|
67
|
+
Set your API key for whichever provider you use:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
export OPENAI_API_KEY="sk-..."
|
|
71
|
+
# or
|
|
72
|
+
export ANTHROPIC_API_KEY="sk-ant-..."
|
|
73
|
+
# or
|
|
74
|
+
export GEMINI_API_KEY="..."
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Quick Start
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
# Analyze a system prompt
|
|
81
|
+
promptdebug analyze prompt.txt --query "I want a refund"
|
|
82
|
+
|
|
83
|
+
# HTML report
|
|
84
|
+
promptdebug analyze prompt.txt --query "I want a refund" --format html
|
|
85
|
+
|
|
86
|
+
# Analyze across multiple queries for more robust results
|
|
87
|
+
promptdebug analyze prompt.txt --queries queries.txt
|
|
88
|
+
|
|
89
|
+
# Validate analysis reliability with a counterfactual injection
|
|
90
|
+
promptdebug analyze prompt.txt --query "test" --sanity-check
|
|
91
|
+
|
|
92
|
+
# Get rewrite suggestions for dead sections
|
|
93
|
+
promptdebug analyze prompt.txt --query "test" --suggest
|
|
94
|
+
|
|
95
|
+
# Watch mode — re-analyze automatically on every save
|
|
96
|
+
promptdebug watch prompt.txt --query "test"
|
|
97
|
+
|
|
98
|
+
# Compare influence between git versions
|
|
99
|
+
promptdebug diff prompt.txt --ref HEAD~1 --query "test"
|
|
100
|
+
|
|
101
|
+
# Compare across models
|
|
102
|
+
promptdebug compare prompt.txt --query "test query" --models gpt-4o-mini,claude-haiku-4-5
|
|
103
|
+
|
|
104
|
+
# Strip dead sections and output a cleaned prompt
|
|
105
|
+
promptdebug optimize prompt.txt --query "test query"
|
|
106
|
+
|
|
107
|
+
# Dry run (no API calls, shows cost estimate)
|
|
108
|
+
promptdebug analyze prompt.txt --query "test" --dry-run
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## How It Works
|
|
112
|
+
|
|
113
|
+
1. **Parse** — Your system prompt is split into sections using automatic strategy detection (markdown headers, XML tags, labeled blocks, numbered lists, or paragraph breaks).
|
|
114
|
+
|
|
115
|
+
2. **Baseline** — The full prompt is sent to the model N times to establish baseline outputs.
|
|
116
|
+
|
|
117
|
+
3. **Ablate** — Each section is removed one at a time. The ablated prompt is sent to the model N times.
|
|
118
|
+
|
|
119
|
+
4. **Score** — Each section gets a composite influence score:
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
influence = 0.60 × semantic + 0.20 × structural + 0.20 × behavioral
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
- **Semantic** — cosine distance between sentence embeddings of baseline vs. ablated output
|
|
126
|
+
- **Structural** — character-level diff + paragraph/bullet/code block feature distance
|
|
127
|
+
- **Behavioral** — format-appropriate signals (JSON field match, classification exact match, or surface signals for free text)
|
|
128
|
+
|
|
129
|
+
5. **Classify** — Sections with influence < 0.10 are classified as **dead**.
|
|
130
|
+
|
|
131
|
+
## Output Example
|
|
132
|
+
|
|
133
|
+
```
|
|
134
|
+
Section 1: Role definition [████████ ] 0.82 HIGH
|
|
135
|
+
Section 2: Output format rules [████ ] 0.44 MEDIUM
|
|
136
|
+
Section 3: Tone guidelines [█ ] 0.12 LOW
|
|
137
|
+
Section 4: Legacy constraint note [ ] 0.03 DEAD
|
|
138
|
+
Section 5: Core task instruction [███████ ] 0.71 HIGH
|
|
139
|
+
|
|
140
|
+
Dead token rate: 14.2% (127 / 894 tokens)
|
|
141
|
+
Estimated savings: ~$0.02 per 1K calls
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## Commands
|
|
145
|
+
|
|
146
|
+
### `analyze` — influence heatmap for a prompt
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
promptdebug analyze prompt.txt --query "test query"
|
|
150
|
+
|
|
151
|
+
# Options
|
|
152
|
+
--queries FILE Text file with one query per line (multi-query mode)
|
|
153
|
+
--model MODEL LLM to use (default: gpt-4o-mini)
|
|
154
|
+
--runs N API calls per ablation (default: 3)
|
|
155
|
+
--temperature FLOAT Sampling temperature (default: 0.3)
|
|
156
|
+
--format FORMAT terminal | html | json | csv (default: terminal)
|
|
157
|
+
--dead-threshold F Influence below this is dead (default: 0.10)
|
|
158
|
+
--sanity-check Inject a counterfactual section; warn if not detected
|
|
159
|
+
--suggest Generate LLM rewrite suggestions for dead sections
|
|
160
|
+
--dry-run Estimate cost without making API calls
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
### `watch` — re-analyze on every file save
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
promptdebug watch prompt.txt --query "test query"
|
|
167
|
+
|
|
168
|
+
# Options
|
|
169
|
+
--interval SECONDS Poll interval in seconds (default: 5)
|
|
170
|
+
--threshold FLOAT Re-print only when dead rate changes by this much
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### `diff` — compare influence between git revisions
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
promptdebug diff prompt.txt --ref HEAD~1 --query "test query"
|
|
177
|
+
|
|
178
|
+
# Options
|
|
179
|
+
--ref REF Git ref to compare against (default: HEAD~1)
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
### `compare` — side-by-side multi-model comparison
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
promptdebug compare prompt.txt --query "test" --models gpt-4o-mini,claude-haiku-4-5
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
### `optimize` — output a cleaned prompt with dead sections removed
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
promptdebug optimize prompt.txt --query "test"
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## Output Formats
|
|
195
|
+
|
|
196
|
+
| Format | Flag | Description |
|
|
197
|
+
|--------|------|-------------|
|
|
198
|
+
| Terminal | `--format terminal` | Rich heatmap (default) |
|
|
199
|
+
| HTML | `--format html` | Interactive report, opens in browser |
|
|
200
|
+
| JSON | `--format json` | Machine-readable export |
|
|
201
|
+
| CSV | `--format csv` | Spreadsheet-friendly export |
|
|
202
|
+
|
|
203
|
+
## Multi-Query Mode
|
|
204
|
+
|
|
205
|
+
Single-query analysis can be noisy — a section that looks dead for one query may be critical for another. Multi-query mode runs ablation across several test queries and aggregates the scores, giving a more stable, query-independent result:
|
|
206
|
+
|
|
207
|
+
```bash
|
|
208
|
+
# queries.txt — one query per line
|
|
209
|
+
printf "I want a refund\nMy login is broken\nHow do I cancel?\n" > queries.txt
|
|
210
|
+
promptdebug analyze prompt.txt --queries queries.txt
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
## Sanity Check
|
|
214
|
+
|
|
215
|
+
Before acting on dead-section results, verify the scoring engine is working correctly for your specific prompt and query. The sanity check injects a known-high-influence instruction and confirms it scores above 0.5. If it doesn't, the analysis may be unreliable:
|
|
216
|
+
|
|
217
|
+
```bash
|
|
218
|
+
promptdebug analyze prompt.txt --query "test" --sanity-check
|
|
219
|
+
# ✓ Sanity check passed (score: 0.73)
|
|
220
|
+
# ⚠ Sanity check failed (score: 0.31) — results may be unreliable for this prompt/query
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
## Watch Mode
|
|
224
|
+
|
|
225
|
+
Iterate on your prompt and see the influence change in real time:
|
|
226
|
+
|
|
227
|
+
```bash
|
|
228
|
+
promptdebug watch prompt.txt --query "I want a refund" --interval 10
|
|
229
|
+
# Watching prompt.txt (every 10s) ...
|
|
230
|
+
# [14:32:07] Change detected — re-analyzing ...
|
|
231
|
+
# ...heatmap...
|
|
232
|
+
# [14:35:22] Change detected — re-analyzing ...
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
## Configuration
|
|
236
|
+
|
|
237
|
+
Create a `.promptdebug.yml` in your project directory (or any parent directory):
|
|
238
|
+
|
|
239
|
+
```yaml
|
|
240
|
+
model: gpt-4o-mini
|
|
241
|
+
runs: 3
|
|
242
|
+
temperature: 0.3
|
|
243
|
+
dead_threshold: 0.10
|
|
244
|
+
cache_expire_days: 7
|
|
245
|
+
weights:
|
|
246
|
+
semantic: 0.6
|
|
247
|
+
structural: 0.2
|
|
248
|
+
behavioral: 0.2
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
All fields are optional. Defaults are shown above.
|
|
252
|
+
|
|
253
|
+
## Supported Models
|
|
254
|
+
|
|
255
|
+
Any model supported by [LiteLLM](https://docs.litellm.ai/docs/providers):
|
|
256
|
+
|
|
257
|
+
- **OpenAI**: gpt-4o, gpt-4o-mini, gpt-4-turbo, ...
|
|
258
|
+
- **Anthropic**: claude-sonnet-4-5, claude-haiku-4-5, ...
|
|
259
|
+
- **Google**: gemini/gemini-2.0-flash, gemini/gemini-1.5-pro, ...
|
|
260
|
+
- **Mistral**: mistral/mistral-large-latest, ...
|
|
261
|
+
- **Local**: ollama/llama3, ollama/codellama, ...
|
|
262
|
+
|
|
263
|
+
## Caching
|
|
264
|
+
|
|
265
|
+
API responses are cached in a local SQLite database (`.promptdebug_cache.db`) using SHA256 content-hash keys. Cache auto-expires after 7 days (configurable). Re-running the same analysis costs zero API calls.
|
|
266
|
+
|
|
267
|
+
## Python API
|
|
268
|
+
|
|
269
|
+
```python
|
|
270
|
+
import asyncio
|
|
271
|
+
from promptdebug import (
|
|
272
|
+
run_ablation,
|
|
273
|
+
run_ablation_multi_query,
|
|
274
|
+
run_sanity_check,
|
|
275
|
+
generate_all_suggestions,
|
|
276
|
+
render_terminal,
|
|
277
|
+
LLMProvider,
|
|
278
|
+
Cache,
|
|
279
|
+
)
|
|
280
|
+
|
|
281
|
+
async def main():
|
|
282
|
+
provider = LLMProvider(model="gpt-4o-mini")
|
|
283
|
+
cache = Cache()
|
|
284
|
+
|
|
285
|
+
# Single-query ablation
|
|
286
|
+
result = await run_ablation(
|
|
287
|
+
prompt_text="You are a helpful assistant. ...",
|
|
288
|
+
query="Hello, how can you help me?",
|
|
289
|
+
provider=provider,
|
|
290
|
+
cache=cache,
|
|
291
|
+
runs=3,
|
|
292
|
+
)
|
|
293
|
+
|
|
294
|
+
render_terminal(result, model="gpt-4o-mini", runs=3)
|
|
295
|
+
|
|
296
|
+
# Multi-query ablation (aggregated)
|
|
297
|
+
aggregated, per_query = await run_ablation_multi_query(
|
|
298
|
+
prompt_text="...",
|
|
299
|
+
queries=["query 1", "query 2", "query 3"],
|
|
300
|
+
provider=provider,
|
|
301
|
+
runs=3,
|
|
302
|
+
)
|
|
303
|
+
|
|
304
|
+
# Sanity check — validate scoring reliability
|
|
305
|
+
passed, score = await run_sanity_check(
|
|
306
|
+
prompt_text="...",
|
|
307
|
+
query="test query",
|
|
308
|
+
provider=provider,
|
|
309
|
+
)
|
|
310
|
+
print(f"Sanity check: {'passed' if passed else 'FAILED'} (score={score:.2f})")
|
|
311
|
+
|
|
312
|
+
# Get rewrite suggestions for dead sections
|
|
313
|
+
suggestions = await generate_all_suggestions(
|
|
314
|
+
section_results=result.sections,
|
|
315
|
+
provider=provider,
|
|
316
|
+
threshold=0.2,
|
|
317
|
+
)
|
|
318
|
+
for section_idx, rewrites in suggestions.items():
|
|
319
|
+
print(f"Section {section_idx} suggestions:")
|
|
320
|
+
for s in rewrites:
|
|
321
|
+
print(f" → {s}")
|
|
322
|
+
|
|
323
|
+
asyncio.run(main())
|
|
324
|
+
```
|
|
325
|
+
|
|
326
|
+
## Development
|
|
327
|
+
|
|
328
|
+
```bash
|
|
329
|
+
git clone https://github.com/entropyvector/promptdebug.git
|
|
330
|
+
cd promptdebug
|
|
331
|
+
pip install -e ".[dev]"
|
|
332
|
+
|
|
333
|
+
# Run unit tests (762 tests, no API key required)
|
|
334
|
+
python -m pytest tests/ --ignore=tests/test_integration.py --ignore=tests/test_integration_real.py
|
|
335
|
+
|
|
336
|
+
# Run integration tests (requires OPENAI_API_KEY)
|
|
337
|
+
python -m pytest tests/test_integration.py -v
|
|
338
|
+
```
|
|
339
|
+
|
|
340
|
+
## License
|
|
341
|
+
|
|
342
|
+
[MIT](LICENSE)
|
|
343
|
+
|
|
344
|
+
## Third-Party Licenses
|
|
345
|
+
|
|
346
|
+
See [THIRD_PARTY_LICENSES.md](THIRD_PARTY_LICENSES.md) for a full list of dependencies and their licenses.
|