promptdebug 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. promptdebug-0.2.0/.github/ISSUE_TEMPLATE/bug_report.md +25 -0
  2. promptdebug-0.2.0/.github/ISSUE_TEMPLATE/feature_request.md +19 -0
  3. promptdebug-0.2.0/.github/workflows/ci.yml +39 -0
  4. promptdebug-0.2.0/.github/workflows/publish.yml +25 -0
  5. promptdebug-0.2.0/.gitignore +29 -0
  6. promptdebug-0.2.0/CHANGELOG.md +52 -0
  7. promptdebug-0.2.0/LICENSE +21 -0
  8. promptdebug-0.2.0/PKG-INFO +346 -0
  9. promptdebug-0.2.0/README.md +300 -0
  10. promptdebug-0.2.0/THIRD_PARTY_LICENSES.md +101 -0
  11. promptdebug-0.2.0/examples/coding_assistant.txt +48 -0
  12. promptdebug-0.2.0/examples/content_moderator.txt +112 -0
  13. promptdebug-0.2.0/examples/customer_support.txt +61 -0
  14. promptdebug-0.2.0/examples/data_extractor.txt +191 -0
  15. promptdebug-0.2.0/examples/rag_pipeline.txt +58 -0
  16. promptdebug-0.2.0/pyproject.toml +93 -0
  17. promptdebug-0.2.0/src/promptdebug/__init__.py +54 -0
  18. promptdebug-0.2.0/src/promptdebug/ablation.py +568 -0
  19. promptdebug-0.2.0/src/promptdebug/cache.py +125 -0
  20. promptdebug-0.2.0/src/promptdebug/cli.py +743 -0
  21. promptdebug-0.2.0/src/promptdebug/config.py +125 -0
  22. promptdebug-0.2.0/src/promptdebug/parser.py +376 -0
  23. promptdebug-0.2.0/src/promptdebug/providers.py +82 -0
  24. promptdebug-0.2.0/src/promptdebug/renderer.py +419 -0
  25. promptdebug-0.2.0/src/promptdebug/scoring.py +569 -0
  26. promptdebug-0.2.0/tests/__init__.py +0 -0
  27. promptdebug-0.2.0/tests/test_ablation.py +125 -0
  28. promptdebug-0.2.0/tests/test_ablation_stress.py +735 -0
  29. promptdebug-0.2.0/tests/test_cache.py +123 -0
  30. promptdebug-0.2.0/tests/test_cache_stress.py +470 -0
  31. promptdebug-0.2.0/tests/test_cli.py +205 -0
  32. promptdebug-0.2.0/tests/test_config.py +93 -0
  33. promptdebug-0.2.0/tests/test_config_stress.py +477 -0
  34. promptdebug-0.2.0/tests/test_coverage_gaps.py +1535 -0
  35. promptdebug-0.2.0/tests/test_integration.py +485 -0
  36. promptdebug-0.2.0/tests/test_integration_real.py +209 -0
  37. promptdebug-0.2.0/tests/test_new_features_stress.py +1259 -0
  38. promptdebug-0.2.0/tests/test_parser.py +137 -0
  39. promptdebug-0.2.0/tests/test_parser_stress.py +769 -0
  40. promptdebug-0.2.0/tests/test_providers.py +288 -0
  41. promptdebug-0.2.0/tests/test_renderer.py +69 -0
  42. promptdebug-0.2.0/tests/test_renderer_stress.py +861 -0
  43. promptdebug-0.2.0/tests/test_scoring.py +112 -0
  44. promptdebug-0.2.0/tests/test_scoring_stress.py +1011 -0
@@ -0,0 +1,25 @@
1
+ ---
2
+ name: Bug report
3
+ about: Report a bug in promptdebug
4
+ title: ''
5
+ labels: bug
6
+ assignees: ''
7
+ ---
8
+
9
+ **Describe the bug**
10
+ A clear description of what the bug is.
11
+
12
+ **To Reproduce**
13
+ ```bash
14
+ # Minimal command to reproduce
15
+ promptdebug analyze prompt.txt --query "..."
16
+ ```
17
+
18
+ **Expected behavior**
19
+ What you expected to happen.
20
+
21
+ **Environment**
22
+ - promptdebug version:
23
+ - Python version:
24
+ - OS:
25
+ - Model used:
@@ -0,0 +1,19 @@
1
+ ---
2
+ name: Feature request
3
+ about: Suggest an idea for promptdebug
4
+ title: ''
5
+ labels: enhancement
6
+ assignees: ''
7
+ ---
8
+
9
+ **Is your feature request related to a problem?**
10
+ A clear description of the problem.
11
+
12
+ **Describe the solution you'd like**
13
+ What you want to happen.
14
+
15
+ **Describe alternatives you've considered**
16
+ Other solutions you've thought about.
17
+
18
+ **Additional context**
19
+ Any other context about the feature request.
@@ -0,0 +1,39 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ fail-fast: false
14
+ matrix:
15
+ python-version: ["3.10", "3.11", "3.12", "3.13", "3.14-dev"]
16
+ include:
17
+ - python-version: "3.14-dev"
18
+ experimental: true
19
+
20
+ continue-on-error: ${{ matrix.experimental || false }}
21
+ steps:
22
+ - uses: actions/checkout@v4
23
+ - name: Set up Python ${{ matrix.python-version }}
24
+ uses: actions/setup-python@v5
25
+ with:
26
+ python-version: ${{ matrix.python-version }}
27
+ allow-prereleases: true
28
+ - name: Install dependencies
29
+ run: |
30
+ python -m pip install --upgrade pip
31
+ pip install -e ".[dev]"
32
+ - name: Lint with ruff
33
+ run: ruff check src/
34
+ - name: Type check with mypy
35
+ run: mypy src/
36
+ - name: Run tests
37
+ run: pytest tests/ --ignore=tests/test_integration.py --ignore=tests/test_integration_real.py
38
+ env:
39
+ NO_COLOR: "1"
@@ -0,0 +1,25 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ publish:
9
+ runs-on: ubuntu-latest
10
+ environment: pypi
11
+ permissions:
12
+ id-token: write # Required for trusted publishing
13
+
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v5
18
+ with:
19
+ python-version: "3.11"
20
+ - name: Install build dependencies
21
+ run: pip install build hatchling
22
+ - name: Build package
23
+ run: python -m build
24
+ - name: Publish to PyPI
25
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,29 @@
1
+ .env
2
+ .env.*
3
+ .venv/
4
+ __pycache__/
5
+ *.pyc
6
+ *.pyo
7
+ *.egg-info/
8
+ *.egg
9
+ dist/
10
+ build/
11
+ .promptdebug_cache.db
12
+ .promptdebug_cache.db-shm
13
+ .promptdebug_cache.db-wal
14
+ .pytest_cache/
15
+ .mypy_cache/
16
+ .ruff_cache/
17
+ .coverage
18
+ htmlcov/
19
+ *.so
20
+ *.log
21
+ *.swp
22
+ *.swo
23
+ .DS_Store
24
+ .claude/
25
+ .idea/
26
+ .vscode/
27
+ docs/
28
+ pilot/.venv/
29
+ pilot/results/
@@ -0,0 +1,52 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.2.0] - 2026-03-08
9
+
10
+ ### Added
11
+
12
+ - **Multi-query ablation** (`--queries queries.txt`): run ablation across multiple test queries and aggregate influence scores, giving a more robust, query-independent result
13
+ - **Sanity check** (`--sanity-check`): inject a known-high-influence counterfactual section and verify the scoring engine detects it — flags unreliable analyses before you act on them
14
+ - **Rewrite suggestions** (`--suggest`): for every dead section, generate LLM-powered replacement candidates via `generate_all_suggestions`
15
+ - **Watch mode** (`promptdebug watch prompt.txt --query "..."`): poll a prompt file and re-run analysis automatically on every save — no more manual re-runs during iteration
16
+ - **Diff command** (`promptdebug diff prompt.txt --ref HEAD~1 --query "..."`): compare influence scores between the current prompt and any git revision
17
+ - **Integration test suite** (`tests/test_integration.py`): 27 tests that make live OpenAI API calls — covers provider round-trips, ablation structure, multi-query aggregation, sanity check, suggestions, cache, and the full customer-support pipeline
18
+ - **Coverage gap tests** (`tests/test_coverage_gaps.py`): 187 tests filling every untested branch across all 8 modules (scoring, parser, renderer, config, providers, ablation, cli, cache)
19
+ - **Stress tests for new features** (`tests/test_new_features_stress.py`): 68 adversarial tests targeting the 5 new features
20
+
21
+ ### Infrastructure
22
+
23
+ - Added CI workflow (GitHub Actions) running tests on Python 3.10–3.14 with ruff and mypy
24
+ - Added trusted-publisher workflow for automatic PyPI release on GitHub release events
25
+ - Added GitHub issue templates (bug report, feature request)
26
+ - Added `ruff`, `mypy`, `types-PyYAML`, `build`, `twine` to dev dependencies
27
+ - Fixed `.gitignore` to exclude SQLite WAL files (`.db-shm`, `.db-wal`)
28
+
29
+ ### Fixed
30
+
31
+ - `np.mean([])` crash in multi-query aggregation when a per-query run has fewer sections than the reference — now falls back gracefully to the reference score
32
+ - `generate_all_suggestions` was executing sequentially despite using `asyncio.Semaphore` — fixed to use `asyncio.gather` for true parallelism
33
+ - `watch` command triggered an immediate spurious analysis on startup — fixed by initialising `last_mtime` / `last_content` from the actual file state
34
+ - `analyze` command read the queries file twice (once for sanity check, once for multi-query) — consolidated to a single pre-read
35
+ - `diff` command failed with absolute paths — `git show ref:/absolute/path` always errors; fixed by resolving the repo-relative path via `git rev-parse --show-toplevel`
36
+ - Sanity check section was merged into the last existing section by the parser — fixed by adding a `## __calibration_control__` markdown header so the parser creates a distinct section
37
+
38
+ ## [0.1.0] - 2026-03-07
39
+
40
+ ### Added
41
+
42
+ - Core ablation engine with leave-one-out section removal
43
+ - Composite influence scoring: semantic (0.60) + structural (0.20) + behavioral (0.20)
44
+ - Semantic similarity via sentence-transformers (all-mpnet-base-v2)
45
+ - Smart prompt parsing with 6 strategies: markdown headers, XML tags, labeled blocks, numbered lists, double newlines, single-section fallback
46
+ - CLI commands: `analyze`, `compare`, `optimize`
47
+ - Output formats: terminal heatmap, HTML report, JSON, CSV
48
+ - SQLite-based response cache with SHA256 content-hash keys
49
+ - Multi-model support via LiteLLM (OpenAI, Anthropic, Google, Mistral, Ollama)
50
+ - Dry-run mode with cost estimation
51
+ - YAML configuration file support (`.promptdebug.yml`)
52
+ - 503 tests with full coverage of all modules
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Zaur Jafarov
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,346 @@
1
+ Metadata-Version: 2.4
2
+ Name: promptdebug
3
+ Version: 0.2.0
4
+ Summary: Find dead tokens in your system prompts. Ablation-based influence analysis for LLM prompts.
5
+ Project-URL: Homepage, https://github.com/entropyvector/promptdebug
6
+ Project-URL: Documentation, https://github.com/entropyvector/promptdebug#readme
7
+ Project-URL: Repository, https://github.com/entropyvector/promptdebug
8
+ Project-URL: Issues, https://github.com/entropyvector/promptdebug/issues
9
+ Project-URL: Changelog, https://github.com/entropyvector/promptdebug/blob/main/CHANGELOG.md
10
+ Author-email: Zaur Jafarov <entropyvector.dev@gmail.com>
11
+ License-Expression: MIT
12
+ License-File: LICENSE
13
+ Keywords: ablation,debugging,llm,optimization,prompt
14
+ Classifier: Development Status :: 4 - Beta
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Programming Language :: Python :: 3.14
24
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
25
+ Classifier: Topic :: Software Development :: Testing
26
+ Classifier: Typing :: Typed
27
+ Requires-Python: >=3.10
28
+ Requires-Dist: jinja2>=3.1.0
29
+ Requires-Dist: litellm>=1.40.0
30
+ Requires-Dist: numpy>=1.26.0
31
+ Requires-Dist: pyyaml>=6.0
32
+ Requires-Dist: rich>=13.0.0
33
+ Requires-Dist: sentence-transformers>=3.0.0
34
+ Requires-Dist: tiktoken>=0.7.0
35
+ Requires-Dist: typer>=0.12.0
36
+ Provides-Extra: dev
37
+ Requires-Dist: build>=1.0; extra == 'dev'
38
+ Requires-Dist: mypy>=1.0; extra == 'dev'
39
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
40
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
41
+ Requires-Dist: python-dotenv>=1.0.0; extra == 'dev'
42
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
43
+ Requires-Dist: twine>=4.0; extra == 'dev'
44
+ Requires-Dist: types-pyyaml>=6.0; extra == 'dev'
45
+ Description-Content-Type: text/markdown
46
+
47
+ # promptdebug
48
+
49
+ [![PyPI version](https://img.shields.io/pypi/v/promptdebug.svg)](https://pypi.org/project/promptdebug/)
50
+ [![Downloads](https://pepy.tech/badge/promptdebug)](https://pepy.tech/project/promptdebug)
51
+ [![CI](https://github.com/entropyvector/promptdebug/actions/workflows/ci.yml/badge.svg)](https://github.com/entropyvector/promptdebug/actions)
52
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
53
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
54
+
55
+ Find dead tokens in your system prompts. Ablation-based influence analysis for LLM prompts.
56
+
57
+ promptdebug systematically removes each section of your system prompt and measures how the model's output changes. Sections that can be removed without affecting the output are **dead weight** — tokens you're paying for that do nothing.
58
+
59
+ ## Install
60
+
61
+ ```bash
62
+ pip install promptdebug
63
+ ```
64
+
65
+ > **Note:** On first run, promptdebug downloads the `all-mpnet-base-v2` sentence-transformers model (~420 MB) for semantic scoring. This happens once and is cached locally by the `sentence-transformers` library.
66
+
67
+ Set your API key for whichever provider you use:
68
+
69
+ ```bash
70
+ export OPENAI_API_KEY="sk-..."
71
+ # or
72
+ export ANTHROPIC_API_KEY="sk-ant-..."
73
+ # or
74
+ export GEMINI_API_KEY="..."
75
+ ```
76
+
77
+ ## Quick Start
78
+
79
+ ```bash
80
+ # Analyze a system prompt
81
+ promptdebug analyze prompt.txt --query "I want a refund"
82
+
83
+ # HTML report
84
+ promptdebug analyze prompt.txt --query "I want a refund" --format html
85
+
86
+ # Analyze across multiple queries for more robust results
87
+ promptdebug analyze prompt.txt --queries queries.txt
88
+
89
+ # Validate analysis reliability with a counterfactual injection
90
+ promptdebug analyze prompt.txt --query "test" --sanity-check
91
+
92
+ # Get rewrite suggestions for dead sections
93
+ promptdebug analyze prompt.txt --query "test" --suggest
94
+
95
+ # Watch mode — re-analyze automatically on every save
96
+ promptdebug watch prompt.txt --query "test"
97
+
98
+ # Compare influence between git versions
99
+ promptdebug diff prompt.txt --ref HEAD~1 --query "test"
100
+
101
+ # Compare across models
102
+ promptdebug compare prompt.txt --query "test query" --models gpt-4o-mini,claude-haiku-4-5
103
+
104
+ # Strip dead sections and output a cleaned prompt
105
+ promptdebug optimize prompt.txt --query "test query"
106
+
107
+ # Dry run (no API calls, shows cost estimate)
108
+ promptdebug analyze prompt.txt --query "test" --dry-run
109
+ ```
110
+
111
+ ## How It Works
112
+
113
+ 1. **Parse** — Your system prompt is split into sections using automatic strategy detection (markdown headers, XML tags, labeled blocks, numbered lists, or paragraph breaks).
114
+
115
+ 2. **Baseline** — The full prompt is sent to the model N times to establish baseline outputs.
116
+
117
+ 3. **Ablate** — Each section is removed one at a time. The ablated prompt is sent to the model N times.
118
+
119
+ 4. **Score** — Each section gets a composite influence score:
120
+
121
+ ```
122
+ influence = 0.60 × semantic + 0.20 × structural + 0.20 × behavioral
123
+ ```
124
+
125
+ - **Semantic** — cosine distance between sentence embeddings of baseline vs. ablated output
126
+ - **Structural** — character-level diff + paragraph/bullet/code block feature distance
127
+ - **Behavioral** — format-appropriate signals (JSON field match, classification exact match, or surface signals for free text)
128
+
129
+ 5. **Classify** — Sections with influence < 0.10 are classified as **dead**.
130
+
131
+ ## Output Example
132
+
133
+ ```
134
+ Section 1: Role definition [████████ ] 0.82 HIGH
135
+ Section 2: Output format rules [████ ] 0.44 MEDIUM
136
+ Section 3: Tone guidelines [█ ] 0.12 LOW
137
+ Section 4: Legacy constraint note [ ] 0.03 DEAD
138
+ Section 5: Core task instruction [███████ ] 0.71 HIGH
139
+
140
+ Dead token rate: 14.2% (127 / 894 tokens)
141
+ Estimated savings: ~$0.02 per 1K calls
142
+ ```
143
+
144
+ ## Commands
145
+
146
+ ### `analyze` — influence heatmap for a prompt
147
+
148
+ ```bash
149
+ promptdebug analyze prompt.txt --query "test query"
150
+
151
+ # Options
152
+ --queries FILE Text file with one query per line (multi-query mode)
153
+ --model MODEL LLM to use (default: gpt-4o-mini)
154
+ --runs N API calls per ablation (default: 3)
155
+ --temperature FLOAT Sampling temperature (default: 0.3)
156
+ --format FORMAT terminal | html | json | csv (default: terminal)
157
+ --dead-threshold F Influence below this is dead (default: 0.10)
158
+ --sanity-check Inject a counterfactual section; warn if not detected
159
+ --suggest Generate LLM rewrite suggestions for dead sections
160
+ --dry-run Estimate cost without making API calls
161
+ ```
162
+
163
+ ### `watch` — re-analyze on every file save
164
+
165
+ ```bash
166
+ promptdebug watch prompt.txt --query "test query"
167
+
168
+ # Options
169
+ --interval SECONDS Poll interval in seconds (default: 5)
170
+ --threshold FLOAT Re-print only when dead rate changes by this much
171
+ ```
172
+
173
+ ### `diff` — compare influence between git revisions
174
+
175
+ ```bash
176
+ promptdebug diff prompt.txt --ref HEAD~1 --query "test query"
177
+
178
+ # Options
179
+ --ref REF Git ref to compare against (default: HEAD~1)
180
+ ```
181
+
182
+ ### `compare` — side-by-side multi-model comparison
183
+
184
+ ```bash
185
+ promptdebug compare prompt.txt --query "test" --models gpt-4o-mini,claude-haiku-4-5
186
+ ```
187
+
188
+ ### `optimize` — output a cleaned prompt with dead sections removed
189
+
190
+ ```bash
191
+ promptdebug optimize prompt.txt --query "test"
192
+ ```
193
+
194
+ ## Output Formats
195
+
196
+ | Format | Flag | Description |
197
+ |--------|------|-------------|
198
+ | Terminal | `--format terminal` | Rich heatmap (default) |
199
+ | HTML | `--format html` | Interactive report, opens in browser |
200
+ | JSON | `--format json` | Machine-readable export |
201
+ | CSV | `--format csv` | Spreadsheet-friendly export |
202
+
203
+ ## Multi-Query Mode
204
+
205
+ Single-query analysis can be noisy — a section that looks dead for one query may be critical for another. Multi-query mode runs ablation across several test queries and aggregates the scores, giving a more stable, query-independent result:
206
+
207
+ ```bash
208
+ # queries.txt — one query per line
209
+ printf "I want a refund\nMy login is broken\nHow do I cancel?\n" > queries.txt
210
+ promptdebug analyze prompt.txt --queries queries.txt
211
+ ```
212
+
213
+ ## Sanity Check
214
+
215
+ Before acting on dead-section results, verify the scoring engine is working correctly for your specific prompt and query. The sanity check injects a known-high-influence instruction and confirms it scores above 0.5. If it doesn't, the analysis may be unreliable:
216
+
217
+ ```bash
218
+ promptdebug analyze prompt.txt --query "test" --sanity-check
219
+ # ✓ Sanity check passed (score: 0.73)
220
+ # ⚠ Sanity check failed (score: 0.31) — results may be unreliable for this prompt/query
221
+ ```
222
+
223
+ ## Watch Mode
224
+
225
+ Iterate on your prompt and see the influence change in real time:
226
+
227
+ ```bash
228
+ promptdebug watch prompt.txt --query "I want a refund" --interval 10
229
+ # Watching prompt.txt (every 10s) ...
230
+ # [14:32:07] Change detected — re-analyzing ...
231
+ # ...heatmap...
232
+ # [14:35:22] Change detected — re-analyzing ...
233
+ ```
234
+
235
+ ## Configuration
236
+
237
+ Create a `.promptdebug.yml` in your project directory (or any parent directory):
238
+
239
+ ```yaml
240
+ model: gpt-4o-mini
241
+ runs: 3
242
+ temperature: 0.3
243
+ dead_threshold: 0.10
244
+ cache_expire_days: 7
245
+ weights:
246
+ semantic: 0.6
247
+ structural: 0.2
248
+ behavioral: 0.2
249
+ ```
250
+
251
+ All fields are optional. Defaults are shown above.
252
+
253
+ ## Supported Models
254
+
255
+ Any model supported by [LiteLLM](https://docs.litellm.ai/docs/providers):
256
+
257
+ - **OpenAI**: gpt-4o, gpt-4o-mini, gpt-4-turbo, ...
258
+ - **Anthropic**: claude-sonnet-4-5, claude-haiku-4-5, ...
259
+ - **Google**: gemini/gemini-2.0-flash, gemini/gemini-1.5-pro, ...
260
+ - **Mistral**: mistral/mistral-large-latest, ...
261
+ - **Local**: ollama/llama3, ollama/codellama, ...
262
+
263
+ ## Caching
264
+
265
+ API responses are cached in a local SQLite database (`.promptdebug_cache.db`) using SHA256 content-hash keys. Cache auto-expires after 7 days (configurable). Re-running the same analysis costs zero API calls.
266
+
267
+ ## Python API
268
+
269
+ ```python
270
+ import asyncio
271
+ from promptdebug import (
272
+ run_ablation,
273
+ run_ablation_multi_query,
274
+ run_sanity_check,
275
+ generate_all_suggestions,
276
+ render_terminal,
277
+ LLMProvider,
278
+ Cache,
279
+ )
280
+
281
+ async def main():
282
+ provider = LLMProvider(model="gpt-4o-mini")
283
+ cache = Cache()
284
+
285
+ # Single-query ablation
286
+ result = await run_ablation(
287
+ prompt_text="You are a helpful assistant. ...",
288
+ query="Hello, how can you help me?",
289
+ provider=provider,
290
+ cache=cache,
291
+ runs=3,
292
+ )
293
+
294
+ render_terminal(result, model="gpt-4o-mini", runs=3)
295
+
296
+ # Multi-query ablation (aggregated)
297
+ aggregated, per_query = await run_ablation_multi_query(
298
+ prompt_text="...",
299
+ queries=["query 1", "query 2", "query 3"],
300
+ provider=provider,
301
+ runs=3,
302
+ )
303
+
304
+ # Sanity check — validate scoring reliability
305
+ passed, score = await run_sanity_check(
306
+ prompt_text="...",
307
+ query="test query",
308
+ provider=provider,
309
+ )
310
+ print(f"Sanity check: {'passed' if passed else 'FAILED'} (score={score:.2f})")
311
+
312
+ # Get rewrite suggestions for dead sections
313
+ suggestions = await generate_all_suggestions(
314
+ section_results=result.sections,
315
+ provider=provider,
316
+ threshold=0.2,
317
+ )
318
+ for section_idx, rewrites in suggestions.items():
319
+ print(f"Section {section_idx} suggestions:")
320
+ for s in rewrites:
321
+ print(f" → {s}")
322
+
323
+ asyncio.run(main())
324
+ ```
325
+
326
+ ## Development
327
+
328
+ ```bash
329
+ git clone https://github.com/entropyvector/promptdebug.git
330
+ cd promptdebug
331
+ pip install -e ".[dev]"
332
+
333
+ # Run unit tests (762 tests, no API key required)
334
+ python -m pytest tests/ --ignore=tests/test_integration.py --ignore=tests/test_integration_real.py
335
+
336
+ # Run integration tests (requires OPENAI_API_KEY)
337
+ python -m pytest tests/test_integration.py tests/test_integration_real.py -v
338
+ ```
339
+
340
+ ## License
341
+
342
+ [MIT](LICENSE)
343
+
344
+ ## Third-Party Licenses
345
+
346
+ See [THIRD_PARTY_LICENSES.md](THIRD_PARTY_LICENSES.md) for a full list of dependencies and their licenses.