breakpoint-library 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. breakpoint_library-0.1.1/PKG-INFO +371 -0
  2. breakpoint_library-0.1.1/README.md +359 -0
  3. breakpoint_library-0.1.1/breakpoint/__init__.py +4 -0
  4. breakpoint_library-0.1.1/breakpoint/cli/__init__.py +1 -0
  5. breakpoint_library-0.1.1/breakpoint/cli/main.py +896 -0
  6. breakpoint_library-0.1.1/breakpoint/config/__init__.py +1 -0
  7. breakpoint_library-0.1.1/breakpoint/config/default_policies.json +69 -0
  8. breakpoint_library-0.1.1/breakpoint/config/presets/__init__.py +2 -0
  9. breakpoint_library-0.1.1/breakpoint/config/presets/chatbot.json +18 -0
  10. breakpoint_library-0.1.1/breakpoint/config/presets/extraction.json +18 -0
  11. breakpoint_library-0.1.1/breakpoint/config/presets/support.json +18 -0
  12. breakpoint_library-0.1.1/breakpoint/engine/__init__.py +3 -0
  13. breakpoint_library-0.1.1/breakpoint/engine/aggregator.py +90 -0
  14. breakpoint_library-0.1.1/breakpoint/engine/config.py +226 -0
  15. breakpoint_library-0.1.1/breakpoint/engine/errors.py +3 -0
  16. breakpoint_library-0.1.1/breakpoint/engine/evaluator.py +264 -0
  17. breakpoint_library-0.1.1/breakpoint/engine/metrics.py +206 -0
  18. breakpoint_library-0.1.1/breakpoint/engine/plugins.py +29 -0
  19. breakpoint_library-0.1.1/breakpoint/engine/policies/__init__.py +1 -0
  20. breakpoint_library-0.1.1/breakpoint/engine/policies/base.py +10 -0
  21. breakpoint_library-0.1.1/breakpoint/engine/policies/cost.py +97 -0
  22. breakpoint_library-0.1.1/breakpoint/engine/policies/drift.py +163 -0
  23. breakpoint_library-0.1.1/breakpoint/engine/policies/latency.py +95 -0
  24. breakpoint_library-0.1.1/breakpoint/engine/policies/output_contract.py +134 -0
  25. breakpoint_library-0.1.1/breakpoint/engine/policies/pii.py +68 -0
  26. breakpoint_library-0.1.1/breakpoint/engine/policies/red_team.py +49 -0
  27. breakpoint_library-0.1.1/breakpoint/engine/reason_codes.py +28 -0
  28. breakpoint_library-0.1.1/breakpoint/engine/waivers.py +161 -0
  29. breakpoint_library-0.1.1/breakpoint/models/__init__.py +3 -0
  30. breakpoint_library-0.1.1/breakpoint/models/decision.py +29 -0
  31. breakpoint_library-0.1.1/breakpoint/pytest_plugin.py +102 -0
  32. breakpoint_library-0.1.1/breakpoint_library.egg-info/PKG-INFO +371 -0
  33. breakpoint_library-0.1.1/breakpoint_library.egg-info/SOURCES.txt +51 -0
  34. breakpoint_library-0.1.1/breakpoint_library.egg-info/dependency_links.txt +1 -0
  35. breakpoint_library-0.1.1/breakpoint_library.egg-info/entry_points.txt +5 -0
  36. breakpoint_library-0.1.1/breakpoint_library.egg-info/requires.txt +7 -0
  37. breakpoint_library-0.1.1/breakpoint_library.egg-info/top_level.txt +1 -0
  38. breakpoint_library-0.1.1/pyproject.toml +35 -0
  39. breakpoint_library-0.1.1/setup.cfg +4 -0
  40. breakpoint_library-0.1.1/tests/test_baseline_lifecycle.py +33 -0
  41. breakpoint_library-0.1.1/tests/test_ci_templates.py +23 -0
  42. breakpoint_library-0.1.1/tests/test_cli.py +783 -0
  43. breakpoint_library-0.1.1/tests/test_cli_metrics.py +71 -0
  44. breakpoint_library-0.1.1/tests/test_evaluate.py +303 -0
  45. breakpoint_library-0.1.1/tests/test_install_worthy_examples.py +34 -0
  46. breakpoint_library-0.1.1/tests/test_metrics.py +138 -0
  47. breakpoint_library-0.1.1/tests/test_output_contract_recursive.py +44 -0
  48. breakpoint_library-0.1.1/tests/test_packaging.py +19 -0
  49. breakpoint_library-0.1.1/tests/test_plugins.py +18 -0
  50. breakpoint_library-0.1.1/tests/test_pytest_plugin.py +72 -0
  51. breakpoint_library-0.1.1/tests/test_quickstart_samples.py +26 -0
  52. breakpoint_library-0.1.1/tests/test_red_team_policy.py +50 -0
  53. breakpoint_library-0.1.1/tests/test_waivers.py +128 -0
@@ -0,0 +1,371 @@
1
+ Metadata-Version: 2.4
2
+ Name: breakpoint-library
3
+ Version: 0.1.1
4
+ Summary: Local-first decision engine for baseline vs candidate LLM output checks.
5
+ Requires-Python: <3.13,>=3.10
6
+ Description-Content-Type: text/markdown
7
+ Provides-Extra: dev
8
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
9
+ Provides-Extra: ml
10
+ Requires-Dist: sentence-transformers>=2.2.2; extra == "ml"
11
+ Requires-Dist: torch>=2.0.0; extra == "ml"
12
+
13
+ # BreakPoint Library
14
+
15
+ Prevent bad AI releases before they hit production.
16
+
17
+ You change a model.
18
+ The output looks fine.
19
+ But:
20
+ - Cost jumps +38%.
21
+ - A phone number slips into the response.
22
+ - The format breaks your downstream parser.
23
+
24
+ BreakPoint catches it before you deploy.
25
+
26
+ It runs locally.
27
+ Policy evaluation is deterministic from your saved artifacts.
28
+ It gives you one clear answer:
29
+
30
+ `ALLOW` · `WARN` · `BLOCK`
31
+
32
+ ## Quick Example
33
+
34
+ ```bash
35
+ breakpoint evaluate baseline.json candidate.json
36
+ ```
37
+
38
+ ```text
39
+ STATUS: BLOCK
40
+
41
+ Reasons:
42
+ - Cost increased by 38% (baseline: 1,000 tokens -> candidate: 1,380)
43
+ - Detected US phone number pattern
44
+ ```
45
+
46
+ Ship with confidence.
47
+
48
+ ## Lite First (Default)
49
+
50
+ Default command:
51
+
52
+ ```bash
53
+ breakpoint evaluate baseline.json candidate.json
54
+ ```
55
+
56
+ Lite mode defaults:
57
+ - Cost: `WARN` at `+20%`, `BLOCK` at `+40%`
58
+ - PII: `BLOCK` on first detection (email, phone, credit card)
59
+ - Drift: `WARN` at `+35%` length delta, `BLOCK` at `+70%`
60
+ - Empty output: always `BLOCK`
61
+
62
+ Lite is local, deterministic, and zero-config.
63
+
64
+ ## Full Mode (Optional Advanced)
65
+
66
+ Use Full mode when you want config-driven governance and CI enforcement:
67
+
68
+ ```bash
69
+ breakpoint evaluate baseline.json candidate.json --mode full --json --fail-on warn
70
+ ```
71
+
72
+ Full mode adds advanced controls:
73
+ - Output contract enforcement
74
+ - Latency policy
75
+ - Presets/environments
76
+ - Waivers
77
+ - Custom pricing models
78
+
79
+ Example terminal output when those controls are in play (output contract, latency, cost, drift):
80
+
81
+ ```bash
82
+ breakpoint evaluate examples/install_worthy/baseline.json examples/install_worthy/candidate_format_regression.json --mode full
83
+ ```
84
+
85
+ ```text
86
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
87
+ BreakPoint Evaluation
88
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
89
+
90
+ Mode: full
91
+
92
+ Final Decision: BLOCK
93
+
94
+ Policy Results:
95
+ ✓ No PII detected: No matches.
96
+ ✗ Response format: Invalid JSON detected (1).
97
+ ✓ Cost: No issues.
98
+ ✓ Latency: Delta +2.78%.
99
+ ✗ Output drift: Length delta +72.86%, similarity 0.067164.
100
+
101
+ Summary:
102
+ - Output contract break: candidate output is not valid JSON.
103
+ - Response length compressed: baseline 140 chars vs candidate 38 chars (72.9%, block threshold 70%).
104
+ 2 additional non-blocking signal(s) detected.
105
+
106
+ Exit Code: 0
107
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
108
+ ```
109
+
110
+ In this output, **Response format** is the output contract enforcement result, **Latency** is the latency policy result, and **Cost** can use custom pricing when you pass token counts and a model. Presets/environments and waivers change thresholds, and their effects appear in the same layout.
111
+
112
+ **Example: human-readable output (Full mode)**
113
+
114
+ ```bash
115
+ breakpoint evaluate examples/quickstart/baseline.json examples/quickstart/candidate_warn.json --mode full
116
+ ```
117
+
118
+ ```text
119
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
120
+ BreakPoint Evaluation
121
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
122
+
123
+ Mode: full
124
+
125
+ Final Decision: WARN
126
+
127
+ Policy Results:
128
+ ✓ No PII detected: No matches.
129
+ ✓ Response format: No schema drift detected.
130
+ ⚠ Cost: Delta +25.00%.
131
+ ⚠ Latency: Delta +35.00%.
132
+ ⚠ Output drift: Length delta +52.17%, similarity 0.400000.
133
+
134
+ Summary:
135
+ Cost increased by 25.0% (>=20%).
136
+ 2 additional signal(s) detected.
137
+
138
+ Exit Code: 0
139
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
140
+ ```
141
+
142
+ **Example: JSON output (Full mode, for CI)**
143
+
144
+ ```bash
145
+ breakpoint evaluate examples/quickstart/baseline.json examples/quickstart/candidate_warn.json --mode full --config examples/quickstart/custom_policy.json --env dev --json
146
+ ```
147
+
148
+ ```json
149
+ {
150
+ "schema_version": "1.0.0",
151
+ "status": "BLOCK",
152
+ "reasons": [
153
+ "Cost increased by 25.0% (>=20%).",
154
+ "Latency increased by 35.0% (>20%).",
155
+ "Response length expanded: baseline 46 chars vs candidate 70 chars (52.2%, threshold 35%)."
156
+ ],
157
+ "reason_codes": [
158
+ "COST_INCREASE_BLOCK",
159
+ "LATENCY_INCREASE_WARN",
160
+ "DRIFT_LENGTH_WARN"
161
+ ],
162
+ "metrics": {
163
+ "cost_delta_pct": 25.0,
164
+ "cost_delta_usd": 0.25,
165
+ "latency_delta_pct": 35.0,
166
+ "latency_delta_ms": 35.0,
167
+ "length_delta_pct": 52.1739,
168
+ "similarity": 0.4
169
+ },
170
+ "metadata": {
171
+ "strict": false,
172
+ "mode": "full",
173
+ "baseline_model": "gpt-4.1-mini",
174
+ "candidate_model": "gpt-4.1-mini",
175
+ "ci": true
176
+ }
177
+ }
178
+ ```
179
+
180
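+ (Thresholds depend on the config and environment: with stricter env thresholds, as in this `--env dev` run, the same candidate that produced `WARN` in the previous example can yield `BLOCK` and a non-zero exit code.)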
181
+
182
+ **What you see in the terminal**
183
+
184
+ When you run without `--json`, the CLI prints the human-readable block shown above: a divider line, **Mode**, **Final Decision** (ALLOW / WARN / BLOCK), **Policy Results** (✓ ⚠ ✗ per policy), **Summary**, and **Exit Code**. There are no colors or external UI libraries; the output is plain text and Unicode symbols only, so it stays dependency-free and CI-friendly. A richer terminal UI (e.g. colored status, panels, or metric tables) was considered but deprioritized to keep cost and complexity down; the current output is the supported terminal experience.
185
+
186
+ ## CI First (Recommended)
187
+
188
+ ```bash
189
+ breakpoint evaluate baseline.json candidate.json --json --fail-on warn
190
+ ```
191
+
192
+ Why this is the recommended integration path:
193
+ - Machine-readable decision payload (`schema_version`, `status`, `reason_codes`, metrics).
194
+ - Non-zero exit code on risky changes.
195
+ - Easy to wire into existing CI without additional services.
196
+
197
+ Default policy posture (out of the box, Lite):
198
+ - Cost: `WARN` at `+20%`, `BLOCK` at `+40%`
199
+ - PII: `BLOCK` on first detection
200
+ - Drift: `WARN` at `+35%`, `BLOCK` at `+70%`
201
+
202
+ ### Copy-Paste GitHub Actions Gate
203
+
204
+ Use the template:
205
+ - `examples/ci/github-actions-breakpoint.yml`
206
+
207
+ Copy it to:
208
+ - `.github/workflows/breakpoint-gate.yml`
209
+
210
+ What `--fail-on warn` means:
211
+ - Any `WARN` or `BLOCK` fails the CI step.
212
+ - Exit behavior remains deterministic: `ALLOW=0`, `WARN=1`, `BLOCK=2`.
213
+
214
+ If you only want to fail on `BLOCK`, change:
215
+ - `BREAKPOINT_FAIL_ON: warn`
216
+ to:
217
+ - `BREAKPOINT_FAIL_ON: block`
218
+
219
+ ## Try In 60 Seconds
220
+
221
+ ```bash
222
+ pip install -e .
223
+ make demo
224
+ ```
225
+
226
+ What you should see:
227
+ - Scenario A: `BLOCK` (cost spike)
228
+ - Scenario B: `BLOCK` (format/contract regression)
229
+ - Scenario C: `BLOCK` (PII + verbosity drift)
230
+ - Scenario D: `BLOCK` (small prompt change -> cost blowup)
231
+
232
+ ## Four Realistic Examples
233
+
234
+ Baseline for all examples:
235
+ - `examples/install_worthy/baseline.json`
236
+
237
+ ### 1) Cost regression after model swap
238
+
239
+ ```bash
240
+ breakpoint evaluate examples/install_worthy/baseline.json examples/install_worthy/candidate_cost_model_swap.json
241
+ ```
242
+
243
+ Expected: `BLOCK`
244
+ Why it matters: output appears equivalent, but cost increases enough to violate policy.
245
+
246
+ ### 2) Structured-output behavior regression
247
+
248
+ ```bash
249
+ breakpoint evaluate examples/install_worthy/baseline.json examples/install_worthy/candidate_format_regression.json
250
+ ```
251
+
252
+ Expected: `BLOCK`
253
+ Why it matters: candidate drops expected structure and drifts from baseline behavior.
254
+
255
+ ### 3) PII appears in candidate output
256
+
257
+ ```bash
258
+ breakpoint evaluate examples/install_worthy/baseline.json examples/install_worthy/candidate_pii_verbosity.json
259
+ ```
260
+
261
+ Expected: `BLOCK`
262
+ Why it matters: candidate introduces PII and adds verbosity drift.
263
+
264
+ ### 4) Small prompt change -> big cost blowup
265
+
266
+ ```bash
267
+ breakpoint evaluate examples/install_worthy/baseline.json examples/install_worthy/candidate_killer_tradeoff.json
268
+ ```
269
+
270
+ Expected: `BLOCK`
271
+ Why it matters: output still looks workable, but detail-heavy prompt changes plus a model upgrade create large cost and latency increases along with output-contract drift.
272
+
273
+ More scenario details:
274
+ - `docs/install-worthy-examples.md`
275
+
276
+ ## CLI
277
+
278
+ Evaluate two JSON files:
279
+
280
+ ```bash
281
+ breakpoint evaluate baseline.json candidate.json
282
+ ```
283
+
284
+ Evaluate a single combined JSON file:
285
+
286
+ ```bash
287
+ breakpoint evaluate payload.json
288
+ ```
289
+
290
+ JSON output for CI/parsing:
291
+
292
+ ```bash
293
+ breakpoint evaluate baseline.json candidate.json --json
294
+ ```
295
+
296
+ Exit-code gating options:
297
+
298
+ ```bash
299
+ # fail on WARN or BLOCK
300
+ breakpoint evaluate baseline.json candidate.json --fail-on warn
301
+
302
+ # fail only on BLOCK
303
+ breakpoint evaluate baseline.json candidate.json --fail-on block
304
+ ```
305
+
306
+ Stable exit codes:
307
+ - `0` = `ALLOW`
308
+ - `1` = `WARN`
309
+ - `2` = `BLOCK`
310
+
311
+ Waivers (suppressions, Full mode):
312
+
313
+ ```bash
314
+ breakpoint evaluate baseline.json candidate.json --mode full --config policy.json --now 2026-02-15T00:00:00Z --json
315
+ ```
316
+
317
+ Config inspection:
318
+
319
+ ```bash
320
+ breakpoint config print
321
+ breakpoint config print --config custom_policy.json
322
+ breakpoint config print --config custom_policy.json --env dev
323
+ ```
324
+
325
+ ## Input Schema
326
+
327
+ Each input JSON is an object with at least:
328
+ - `output` (string)
329
+
330
+ Optional fields used by policies:
331
+ - `cost_usd` (number)
332
+ - `model` (string)
333
+ - `tokens_total` (number)
334
+ - `tokens_in` / `tokens_out` (number)
335
+ - `latency_ms` (number)
336
+
337
+ Combined input format:
338
+
339
+ ```json
340
+ {
341
+ "baseline": { "output": "..." },
342
+ "candidate": { "output": "..." }
343
+ }
344
+ ```
345
+
346
+ ## Python API
347
+
348
+ ```python
349
+ from breakpoint import evaluate
350
+
351
+ decision = evaluate(
352
+ baseline_output="hello",
353
+ candidate_output="hello there",
354
+ metadata={"baseline_tokens": 100, "candidate_tokens": 140},
355
+ )
356
+ print(decision.status)
357
+ print(decision.reasons)
358
+ ```
359
+
360
+ ## Additional Docs
361
+
362
+ - `docs/user-guide.md`
363
+ - `docs/user-guide-full-mode.md` (Full mode: config, presets, environments, waivers)
364
+ - `docs/terminal-output-lite-vs-full.md` (Lite vs Full terminal output, same format)
365
+ - `docs/quickstart-10min.md`
366
+ - `docs/install-worthy-examples.md`
367
+ - `docs/baseline-lifecycle.md`
368
+ - `docs/ci-templates.md`
369
+ - `docs/value-metrics.md`
370
+ - `docs/policy-presets.md`
371
+ - `docs/release-gate-audit.md`