invarlock 0.3.5__tar.gz → 0.3.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149)
  1. {invarlock-0.3.5/src/invarlock.egg-info → invarlock-0.3.6}/PKG-INFO +6 -6
  2. {invarlock-0.3.5 → invarlock-0.3.6}/README.md +5 -5
  3. {invarlock-0.3.5 → invarlock-0.3.6}/pyproject.toml +1 -1
  4. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/__init__.py +1 -1
  5. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/_data/runtime/tiers.yaml +57 -30
  6. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/adapters/__init__.py +1 -1
  7. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/calibration/spectral_null.py +15 -10
  8. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/calibration/variance_ve.py +0 -2
  9. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/commands/calibrate.py +6 -2
  10. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/commands/certify.py +58 -39
  11. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/commands/doctor.py +3 -1
  12. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/commands/explain_gates.py +57 -8
  13. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/commands/report.py +1 -1
  14. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/commands/run.py +159 -61
  15. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/commands/verify.py +78 -4
  16. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/config.py +21 -5
  17. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/core/api.py +45 -5
  18. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/core/auto_tuning.py +65 -20
  19. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/core/contracts.py +7 -1
  20. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/core/registry.py +2 -2
  21. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/core/runner.py +314 -50
  22. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/eval/bench.py +0 -13
  23. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/eval/data.py +14 -28
  24. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/eval/metrics.py +4 -1
  25. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/eval/primary_metric.py +23 -0
  26. invarlock-0.3.6/src/invarlock/eval/tail_stats.py +230 -0
  27. invarlock-0.3.6/src/invarlock/guards/_estimators.py +154 -0
  28. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/guards/policies.py +16 -6
  29. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/guards/rmt.py +625 -544
  30. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/guards/spectral.py +348 -110
  31. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/guards/tier_config.py +32 -30
  32. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/guards/variance.py +5 -29
  33. invarlock-0.3.6/src/invarlock/guards_ref/rmt_ref.py +40 -0
  34. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/model_profile.py +42 -15
  35. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/reporting/certificate.py +225 -46
  36. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/reporting/certificate_schema.py +2 -1
  37. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/reporting/dataset_hashing.py +15 -2
  38. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/reporting/guards_analysis.py +197 -274
  39. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/reporting/normalizer.py +6 -0
  40. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/reporting/policy_utils.py +38 -36
  41. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/reporting/primary_metric_utils.py +71 -17
  42. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/reporting/render.py +61 -0
  43. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/reporting/report.py +1 -1
  44. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/reporting/report_types.py +5 -2
  45. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/reporting/validate.py +1 -18
  46. {invarlock-0.3.5 → invarlock-0.3.6/src/invarlock.egg-info}/PKG-INFO +6 -6
  47. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock.egg-info/SOURCES.txt +2 -0
  48. invarlock-0.3.5/src/invarlock/guards_ref/rmt_ref.py +0 -40
  49. {invarlock-0.3.5 → invarlock-0.3.6}/LICENSE +0 -0
  50. {invarlock-0.3.5 → invarlock-0.3.6}/MANIFEST.in +0 -0
  51. {invarlock-0.3.5 → invarlock-0.3.6}/setup.cfg +0 -0
  52. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/__main__.py +0 -0
  53. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/_data/runtime/profiles/ci_cpu.yaml +0 -0
  54. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/_data/runtime/profiles/release.yaml +0 -0
  55. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/adapters/_capabilities.py +0 -0
  56. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/adapters/auto.py +0 -0
  57. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/adapters/base.py +0 -0
  58. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/adapters/base_types.py +0 -0
  59. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/adapters/capabilities.py +0 -0
  60. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/adapters/hf_bert.py +0 -0
  61. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/adapters/hf_gpt2.py +0 -0
  62. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/adapters/hf_llama.py +0 -0
  63. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/adapters/hf_loading.py +0 -0
  64. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/adapters/hf_mixin.py +0 -0
  65. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/adapters/hf_onnx.py +0 -0
  66. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/adapters/hf_t5.py +0 -0
  67. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/adapters/py.typed +0 -0
  68. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/assurance/__init__.py +0 -0
  69. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/calibration/__init__.py +0 -0
  70. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/__init__.py +0 -0
  71. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/__main__.py +0 -0
  72. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/_evidence.py +0 -0
  73. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/_json.py +0 -0
  74. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/adapter_auto.py +0 -0
  75. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/app.py +0 -0
  76. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/commands/__init__.py +0 -0
  77. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/commands/export_html.py +0 -0
  78. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/commands/plugins.py +0 -0
  79. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/constants.py +0 -0
  80. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/determinism.py +0 -0
  81. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/device.py +0 -0
  82. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/doctor_helpers.py +0 -0
  83. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/errors.py +0 -0
  84. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/overhead_utils.py +0 -0
  85. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/provenance.py +0 -0
  86. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/cli/utils.py +0 -0
  87. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/config.py +0 -0
  88. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/core/__init__.py +0 -0
  89. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/core/abi.py +0 -0
  90. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/core/bootstrap.py +0 -0
  91. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/core/checkpoint.py +0 -0
  92. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/core/error_utils.py +0 -0
  93. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/core/events.py +0 -0
  94. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/core/exceptions.py +0 -0
  95. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/core/retry.py +0 -0
  96. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/core/types.py +0 -0
  97. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/edits/__init__.py +0 -0
  98. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/edits/_edit_utils.py +0 -0
  99. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/edits/_external_utils.py +0 -0
  100. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/edits/noop.py +0 -0
  101. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/edits/py.typed +0 -0
  102. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/edits/quant_rtn.py +0 -0
  103. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/edits/registry.py +0 -0
  104. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/eval/__init__.py +0 -0
  105. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/eval/bench_regression.py +0 -0
  106. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/eval/bootstrap.py +0 -0
  107. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/eval/probes/__init__.py +0 -0
  108. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/eval/probes/fft.py +0 -0
  109. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/eval/probes/mi.py +0 -0
  110. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/eval/probes/post_attention.py +0 -0
  111. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/eval/providers/base.py +0 -0
  112. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/eval/providers/seq2seq.py +0 -0
  113. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/eval/providers/text_lm.py +0 -0
  114. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/eval/providers/vision_text.py +0 -0
  115. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/eval/py.typed +0 -0
  116. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/guards/__init__.py +0 -0
  117. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/guards/_contracts.py +0 -0
  118. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/guards/invariants.py +0 -0
  119. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/guards/py.typed +0 -0
  120. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/guards_ref/__init__.py +0 -0
  121. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/guards_ref/spectral_ref.py +0 -0
  122. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/guards_ref/variance_ref.py +0 -0
  123. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/model_utils.py +0 -0
  124. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/observability/__init__.py +0 -0
  125. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/observability/alerting.py +0 -0
  126. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/observability/core.py +0 -0
  127. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/observability/exporters.py +0 -0
  128. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/observability/health.py +0 -0
  129. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/observability/metrics.py +0 -0
  130. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/observability/py.typed +0 -0
  131. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/observability/utils.py +0 -0
  132. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/plugins/__init__.py +0 -0
  133. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/plugins/hello_guard.py +0 -0
  134. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/plugins/hf_awq_adapter.py +0 -0
  135. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/plugins/hf_bnb_adapter.py +0 -0
  136. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/plugins/hf_gptq_adapter.py +0 -0
  137. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/plugins/py.typed +0 -0
  138. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/py.typed +0 -0
  139. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/reporting/__init__.py +0 -0
  140. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/reporting/html.py +0 -0
  141. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/reporting/utils.py +0 -0
  142. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/security.py +0 -0
  143. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/sparsity_utils.py +0 -0
  144. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/utils/__init__.py +0 -0
  145. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock/utils/digest.py +0 -0
  146. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock.egg-info/dependency_links.txt +0 -0
  147. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock.egg-info/entry_points.txt +0 -0
  148. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock.egg-info/requires.txt +0 -0
  149. {invarlock-0.3.5 → invarlock-0.3.6}/src/invarlock.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: invarlock
3
- Version: 0.3.5
3
+ Version: 0.3.6
4
4
  Summary: Edit‑agnostic robustness certificates for weight edits (InvarLock framework)
5
5
  Author-email: InvarLock Team <oss@invarlock.dev>
6
6
  Maintainer-email: InvarLock Maintainers <support@invarlock.dev>
@@ -112,7 +112,7 @@ they don’t, roll back safely.
112
112
  Technical: edit‑agnostic guard pipeline (invariants → spectral → RMT →
113
113
  variance) producing a machine‑readable Safety Certificate.
114
114
 
115
- > **Status:** 0.3.5 (pre‑1.0). Until 1.0, **minor** releases may be
115
+ > **Status:** 0.3.6 (pre‑1.0). Until 1.0, **minor** releases may be
116
116
  > breaking. See CLI help and the CHANGELOG for updates.
117
117
 
118
118
  [![CI](https://img.shields.io/github/actions/workflow/status/invarlock/invarlock/ci.yml?branch=main&logo=github&label=CI)](https://github.com/invarlock/invarlock/actions/workflows/ci.yml)
@@ -170,7 +170,7 @@ Quick examples (repo presets, CPU; repo clone required for preset paths):
170
170
  pip install "invarlock[hf]"
171
171
 
172
172
  # Preflight a config (JSON diagnostics)
173
- invarlock doctor --config configs/tasks/causal_lm/ci_cpu.yaml --json
173
+ invarlock doctor --config configs/presets/causal_lm/wikitext2_512.yaml --json
174
174
 
175
175
  # Calibrated GPT‑2 small (recommended starting point; repo preset)
176
176
  INVARLOCK_ALLOW_NETWORK=1 INVARLOCK_DEDUP_TEXTS=1 \
@@ -179,7 +179,7 @@ invarlock certify \
179
179
  --subject gpt2 \
180
180
  --adapter auto \
181
181
  --profile release \
182
- --preset configs/tasks/causal_lm/release_auto.yaml
182
+ --preset configs/presets/causal_lm/wikitext2_512.yaml
183
183
 
184
184
  # Tiny causal LM smoke (out‑of‑calibration, dev‑only)
185
185
  INVARLOCK_ALLOW_NETWORK=1 \
@@ -249,7 +249,7 @@ INVARLOCK_ALLOW_NETWORK=1 invarlock certify \
249
249
  --subject gpt2 \
250
250
  --adapter auto \
251
251
  --profile ci \
252
- --preset configs/tasks/causal_lm/ci_cpu.yaml
252
+ --preset configs/presets/causal_lm/wikitext2_512.yaml
253
253
  ```
254
254
 
255
255
  - Offline/air‑gapped usage: pre‑download to a cache, then run with network
@@ -488,7 +488,7 @@ output:
488
488
  Run preflight checks before a run to catch misconfigurations early:
489
489
 
490
490
  ```bash
491
- invarlock doctor --config configs/tasks/causal_lm/ci_cpu.yaml --json
491
+ invarlock doctor --config configs/presets/causal_lm/wikitext2_512.yaml --json
492
492
  ```
493
493
 
494
494
  Text mode emits lines prefixed with `ERROR:`, `WARNING:`, or `NOTE:` and stable
@@ -6,7 +6,7 @@ they don’t, roll back safely.
6
6
  Technical: edit‑agnostic guard pipeline (invariants → spectral → RMT →
7
7
  variance) producing a machine‑readable Safety Certificate.
8
8
 
9
- > **Status:** 0.3.5 (pre‑1.0). Until 1.0, **minor** releases may be
9
+ > **Status:** 0.3.6 (pre‑1.0). Until 1.0, **minor** releases may be
10
10
  > breaking. See CLI help and the CHANGELOG for updates.
11
11
 
12
12
  [![CI](https://img.shields.io/github/actions/workflow/status/invarlock/invarlock/ci.yml?branch=main&logo=github&label=CI)](https://github.com/invarlock/invarlock/actions/workflows/ci.yml)
@@ -64,7 +64,7 @@ Quick examples (repo presets, CPU; repo clone required for preset paths):
64
64
  pip install "invarlock[hf]"
65
65
 
66
66
  # Preflight a config (JSON diagnostics)
67
- invarlock doctor --config configs/tasks/causal_lm/ci_cpu.yaml --json
67
+ invarlock doctor --config configs/presets/causal_lm/wikitext2_512.yaml --json
68
68
 
69
69
  # Calibrated GPT‑2 small (recommended starting point; repo preset)
70
70
  INVARLOCK_ALLOW_NETWORK=1 INVARLOCK_DEDUP_TEXTS=1 \
@@ -73,7 +73,7 @@ invarlock certify \
73
73
  --subject gpt2 \
74
74
  --adapter auto \
75
75
  --profile release \
76
- --preset configs/tasks/causal_lm/release_auto.yaml
76
+ --preset configs/presets/causal_lm/wikitext2_512.yaml
77
77
 
78
78
  # Tiny causal LM smoke (out‑of‑calibration, dev‑only)
79
79
  INVARLOCK_ALLOW_NETWORK=1 \
@@ -143,7 +143,7 @@ INVARLOCK_ALLOW_NETWORK=1 invarlock certify \
143
143
  --subject gpt2 \
144
144
  --adapter auto \
145
145
  --profile ci \
146
- --preset configs/tasks/causal_lm/ci_cpu.yaml
146
+ --preset configs/presets/causal_lm/wikitext2_512.yaml
147
147
  ```
148
148
 
149
149
  - Offline/air‑gapped usage: pre‑download to a cache, then run with network
@@ -382,7 +382,7 @@ output:
382
382
  Run preflight checks before a run to catch misconfigurations early:
383
383
 
384
384
  ```bash
385
- invarlock doctor --config configs/tasks/causal_lm/ci_cpu.yaml --json
385
+ invarlock doctor --config configs/presets/causal_lm/wikitext2_512.yaml --json
386
386
  ```
387
387
 
388
388
  Text mode emits lines prefixed with `ERROR:`, `WARNING:`, or `NOTE:` and stable
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "invarlock"
7
- version = "0.3.5"
7
+ version = "0.3.6"
8
8
  description = "Edit‑agnostic robustness certificates for weight edits (InvarLock framework)"
9
9
  authors = [{ name = "InvarLock Team", email = "oss@invarlock.dev" }]
10
10
  maintainers = [{ name = "InvarLock Maintainers", email = "support@invarlock.dev" }]
@@ -12,7 +12,7 @@ For torch-dependent functionality, see subpackages under `invarlock.*`:
12
12
  - `invarlock.eval`: Metrics, guard-overhead checks, and certification
13
13
  """
14
14
 
15
- __version__ = "0.3.5"
15
+ __version__ = "0.3.6"
16
16
 
17
17
  # Core exports - torch-independent
18
18
  from .config import CFG, Defaults, get_default_config
@@ -1,8 +1,12 @@
1
- # Tier guard policy knobs for variance correction (balanced vs conservative)
1
+ # Tier policy defaults (metrics gates + guard knobs) used at runtime.
2
2
  #
3
- # These values mirror the settings validated during the December 2025
4
- # calibration runs. They should be kept in sync with the policy digest
5
- # embedded in certificates and referenced by automation documentation.
3
+ # Balanced and Conservative values are calibrated/validated against pilot/null
4
+ # runs (Nov/Dec 2025) where applicable; Aggressive is research-oriented (not in
5
+ # the safety case).
6
+ #
7
+ # Rationale by key: docs/reference/tier-policy-catalog.md
8
+ # Calibration method: docs/assurance/09-tier-v1-calibration.md
9
+ # Provenance/digest: docs/assurance/11-policy-provenance.md
6
10
 
7
11
  balanced:
8
12
  metrics:
@@ -11,6 +15,13 @@ balanced:
11
15
  min_tokens: 50000
12
16
  hysteresis_ratio: 0.002
13
17
  min_token_fraction: 0.01
18
+ pm_tail:
19
+ mode: warn
20
+ min_windows: 50
21
+ quantile: 0.95
22
+ quantile_max: 0.20
23
+ epsilon: 0.0001
24
+ mass_max: 1.0
14
25
  accuracy:
15
26
  delta_min_pp: -1.0
16
27
  min_examples: 200
@@ -20,7 +31,7 @@ balanced:
20
31
  deadband: 0.02
21
32
  min_abs_adjust: 0.012
22
33
  max_scale_step: 0.03
23
- min_effect_lognll: 0.0009
34
+ min_effect_lognll: 0.0
24
35
  predictive_one_sided: true
25
36
  topk_backstop: 1
26
37
  max_adjusted_modules: 1
@@ -33,10 +44,10 @@ balanced:
33
44
  max_caps: 5
34
45
  max_spectral_norm: null
35
46
  family_caps:
36
- ffn: 3.834
37
- attn: 3.423
38
- embed: 3.1
39
- other: 3.1
47
+ ffn: 3.849
48
+ attn: 3.018
49
+ embed: 1.05
50
+ other: 0.0
40
51
  multiple_testing:
41
52
  method: bh
42
53
  alpha: 0.05
@@ -44,12 +55,12 @@ balanced:
44
55
  rmt_guard:
45
56
  deadband: 0.10
46
57
  margin: 1.5
47
- epsilon_default: 0.10
58
+ epsilon_default: 0.01
48
59
  epsilon_by_family:
49
- ffn: 0.10
50
- attn: 0.08
51
- embed: 0.12
52
- other: 0.12
60
+ ffn: 0.01
61
+ attn: 0.01
62
+ embed: 0.01
63
+ other: 0.01
53
64
 
54
65
  conservative:
55
66
  metrics:
@@ -58,6 +69,13 @@ conservative:
58
69
  min_tokens: 20000
59
70
  hysteresis_ratio: 0.002
60
71
  min_token_fraction: 0.01
72
+ pm_tail:
73
+ mode: warn
74
+ min_windows: 50
75
+ quantile: 0.95
76
+ quantile_max: 0.12
77
+ epsilon: 0.0001
78
+ mass_max: 1.0
61
79
  accuracy:
62
80
  delta_min_pp: -0.5
63
81
  min_examples: 200
@@ -67,7 +85,7 @@ conservative:
67
85
  deadband: 0.03
68
86
  min_abs_adjust: 0.02
69
87
  max_scale_step: 0.015
70
- min_effect_lognll: 0.0018
88
+ min_effect_lognll: 0.016
71
89
  predictive_one_sided: false
72
90
  topk_backstop: 0
73
91
  max_adjusted_modules: 0
@@ -78,24 +96,25 @@ conservative:
78
96
  deadband: 0.05
79
97
  scope: ffn
80
98
  max_caps: 3
99
+ max_spectral_norm: null
81
100
  family_caps:
82
- ffn: 2.3
101
+ ffn: 3.849
83
102
  attn: 2.6
84
103
  embed: 2.8
85
104
  other: 2.8
86
105
  multiple_testing:
87
106
  method: bonferroni
88
- alpha: 0.02
107
+ alpha: 0.000625
89
108
  m: 4
90
109
  rmt_guard:
91
110
  deadband: 0.05
92
111
  margin: 1.3
93
- epsilon_default: 0.06
112
+ epsilon_default: 0.01
94
113
  epsilon_by_family:
95
- ffn: 0.06
96
- attn: 0.05
97
- embed: 0.07
98
- other: 0.07
114
+ ffn: 0.01
115
+ attn: 0.01
116
+ embed: 0.01
117
+ other: 0.01
99
118
 
100
119
  aggressive:
101
120
  metrics:
@@ -104,6 +123,13 @@ aggressive:
104
123
  min_tokens: 50000
105
124
  hysteresis_ratio: 0.002
106
125
  min_token_fraction: 0.01
126
+ pm_tail:
127
+ mode: warn
128
+ min_windows: 50
129
+ quantile: 0.95
130
+ quantile_max: 0.30
131
+ epsilon: 0.0001
132
+ mass_max: 1.0
107
133
  accuracy:
108
134
  delta_min_pp: -2.0
109
135
  min_examples: 200
@@ -111,27 +137,28 @@ aggressive:
111
137
  min_examples_fraction: 0.01
112
138
  variance_guard:
113
139
  deadband: 0.12
114
- min_effect_lognll: 0.0005
140
+ min_effect_lognll: 0.033
115
141
  spectral_guard:
116
142
  sigma_quantile: 0.98
117
143
  deadband: 0.15
118
144
  scope: ffn
119
145
  max_caps: 8
146
+ max_spectral_norm: null
120
147
  family_caps:
121
- ffn: 3.0
148
+ ffn: 3.849
122
149
  attn: 3.5
123
150
  embed: 2.5
124
151
  other: 3.5
125
152
  multiple_testing:
126
153
  method: bh
127
- alpha: 0.1
154
+ alpha: 0.00078125
128
155
  m: 4
129
156
  rmt_guard:
130
157
  deadband: 0.15
131
158
  margin: 1.8
132
- epsilon_default: 0.15
159
+ epsilon_default: 0.01
133
160
  epsilon_by_family:
134
- ffn: 0.15
135
- attn: 0.15
136
- embed: 0.15
137
- other: 0.15
161
+ ffn: 0.01
162
+ attn: 0.01
163
+ embed: 0.01
164
+ other: 0.01
@@ -76,7 +76,7 @@ class _RemovedComponent:
76
76
  return _RemovedComponent(self._name, self._replacement)
77
77
 
78
78
 
79
- # Placeholders for removed/legacy utilities referenced in tests
79
+ # Placeholders for removed utilities referenced in tests
80
80
  HF_Pythia_Adapter = _RemovedComponent("HF_Pythia_Adapter")
81
81
  auto_tune_pruning_budget = _RemovedComponent("auto_tune_pruning_budget")
82
82
  run_auto_invarlock = _RemovedComponent("run_auto_invarlock")
@@ -148,7 +148,7 @@ def _selected_families_for_alpha(
148
148
 
149
149
 
150
150
  def summarize_null_sweep_reports(
151
- reports: list[dict[str, Any]],
151
+ reports: list[object],
152
152
  *,
153
153
  tier: str,
154
154
  safety_margin: float = 0.05,
@@ -186,20 +186,25 @@ def summarize_null_sweep_reports(
186
186
  mt = _extract_multiple_testing(metrics)
187
187
  if mt:
188
188
  mt_method = str(mt.get("method", mt_method))
189
- if mt.get("alpha") is not None:
190
- mt_alpha = float(mt.get("alpha"))
191
- if mt.get("m") is not None:
192
- mt_m = int(mt.get("m"))
189
+ alpha_value = mt.get("alpha")
190
+ if alpha_value is not None:
191
+ try:
192
+ mt_alpha = float(alpha_value)
193
+ except Exception:
194
+ pass
195
+ m_value = mt.get("m")
196
+ if m_value is not None:
197
+ try:
198
+ mt_m = int(m_value)
199
+ except Exception:
200
+ pass
193
201
 
194
202
  fam_z = _extract_family_max_z(metrics)
195
203
  for fam, z in fam_z.items():
196
204
  family_max_z[fam] = max(family_max_z[fam], float(z))
197
205
 
198
- selection = (
199
- metrics.get("multiple_testing_selection")
200
- if isinstance(metrics.get("multiple_testing_selection"), dict)
201
- else {}
202
- )
206
+ raw_selection = metrics.get("multiple_testing_selection")
207
+ selection = raw_selection if isinstance(raw_selection, dict) else {}
203
208
  pvals = selection.get("family_pvalues")
204
209
  if not isinstance(pvals, dict):
205
210
  pvals = {}
@@ -107,8 +107,6 @@ def summarize_ve_sweep_reports(
107
107
  evaluated = 0
108
108
 
109
109
  for report in reports:
110
- if not isinstance(report, dict):
111
- continue
112
110
  g = _extract_guard(report, "variance") or {}
113
111
  metrics = g.get("metrics", {}) if isinstance(g.get("metrics"), dict) else {}
114
112
  pg = metrics.get("predictive_gate")
@@ -144,7 +144,9 @@ def null_sweep(
144
144
  ),
145
145
  n_seeds: int = typer.Option(10, "--n-seeds", min=1, help="Number of seeds to run."),
146
146
  seed_start: int = typer.Option(42, "--seed-start", help="Starting seed."),
147
- profile: str = typer.Option("ci", "--profile", help="Run profile (ci|release)."),
147
+ profile: str = typer.Option(
148
+ "ci", "--profile", help="Run profile (ci|release|ci_cpu|dev)."
149
+ ),
148
150
  device: str | None = typer.Option(None, "--device", help="Device override."),
149
151
  safety_margin: float = typer.Option(
150
152
  0.05, "--safety-margin", help="Safety margin applied to κ recommendations."
@@ -363,7 +365,9 @@ def ve_sweep(
363
365
  "--target-enable-rate",
364
366
  help="Target expected VE enable rate (predictive-gate lower bound).",
365
367
  ),
366
- profile: str = typer.Option("ci", "--profile", help="Run profile (ci|release)."),
368
+ profile: str = typer.Option(
369
+ "ci", "--profile", help="Run profile (ci|release|ci_cpu|dev)."
370
+ ),
367
371
  device: str | None = typer.Option(None, "--device", help="Device override."),
368
372
  safety_margin: float = typer.Option(
369
373
  0.0,
@@ -22,9 +22,9 @@ from typing import Any
22
22
  import typer
23
23
  from rich.console import Console
24
24
 
25
+ from ...core.exceptions import MetricsError
25
26
  from ..adapter_auto import resolve_auto_adapter
26
27
  from ..config import _deep_merge as _merge # reuse helper
27
- from ..errors import InvarlockError
28
28
 
29
29
  # Use the report group's programmatic entry for report generation
30
30
  from .report import report_command as _report
@@ -98,7 +98,9 @@ def certify_command(
98
98
  "--device",
99
99
  help="Device override for runs (auto|cuda|mps|cpu)",
100
100
  ),
101
- profile: str = typer.Option("ci", "--profile", help="Profile (ci|release)"),
101
+ profile: str = typer.Option(
102
+ "ci", "--profile", help="Profile (ci|release|ci_cpu|dev)"
103
+ ),
102
104
  tier: str = typer.Option("balanced", "--tier", help="Tier label for context"),
103
105
  preset: str | None = typer.Option(
104
106
  None,
@@ -152,9 +154,9 @@ def certify_command(
152
154
  # scenario), fall back to a minimal built-in universal preset so the
153
155
  # flag-only quick start works without cloning the repo.
154
156
  default_universal = (
155
- Path("configs/tasks/masked_lm/ci_cpu.yaml")
157
+ Path("configs/presets/masked_lm/wikitext2_128.yaml")
156
158
  if eff_adapter == "hf_bert"
157
- else Path("configs/tasks/causal_lm/ci_cpu.yaml")
159
+ else Path("configs/presets/causal_lm/wikitext2_512.yaml")
158
160
  )
159
161
  preset_path = Path(preset) if preset is not None else default_universal
160
162
 
@@ -185,6 +187,20 @@ def certify_command(
185
187
  model_block.pop("device", None)
186
188
  preset_data["model"] = model_block
187
189
 
190
+ default_guards_order = ["invariants", "spectral", "rmt", "variance", "invariants"]
191
+ guards_order = None
192
+ preset_guards = preset_data.get("guards")
193
+ if isinstance(preset_guards, dict):
194
+ preset_order = preset_guards.get("order")
195
+ if (
196
+ isinstance(preset_order, list)
197
+ and preset_order
198
+ and all(isinstance(item, str) for item in preset_order)
199
+ ):
200
+ guards_order = list(preset_order)
201
+ if guards_order is None:
202
+ guards_order = list(default_guards_order)
203
+
188
204
  # Create temp baseline config (no-op edit)
189
205
  # Normalize possible "hf:" prefixes for HF adapters
190
206
  norm_src_id = _normalize_model_id(src_id, eff_adapter)
@@ -199,9 +215,7 @@ def certify_command(
199
215
  },
200
216
  "edit": {"name": "noop", "plan": {}},
201
217
  "eval": {},
202
- "guards": {
203
- "order": ["invariants", "spectral", "rmt", "variance", "invariants"]
204
- },
218
+ "guards": {"order": guards_order},
205
219
  "output": {"dir": str(Path(out) / "source")},
206
220
  "context": {"profile": profile, "tier": tier},
207
221
  },
@@ -292,15 +306,7 @@ def certify_command(
292
306
  "model": {"id": norm_edt_id, "adapter": eff_adapter},
293
307
  "edit": {"name": "noop", "plan": {}},
294
308
  "eval": {},
295
- "guards": {
296
- "order": [
297
- "invariants",
298
- "spectral",
299
- "rmt",
300
- "variance",
301
- "invariants",
302
- ]
303
- },
309
+ "guards": {"order": guards_order},
304
310
  "output": {"dir": str(Path(out) / "edited")},
305
311
  "context": {"profile": profile, "tier": tier},
306
312
  },
@@ -325,12 +331,11 @@ def certify_command(
325
331
  raise typer.Exit(1)
326
332
 
327
333
  # CI/Release hard‑abort: fail fast when primary metric is not computable.
328
- # Fall back to legacy ppl_* keys when primary_metric block is absent.
329
334
  try:
330
335
  prof = str(profile or "").strip().lower()
331
336
  except Exception:
332
337
  prof = ""
333
- if prof in {"ci", "release"}:
338
+ if prof in {"ci", "ci_cpu", "release"}:
334
339
  try:
335
340
  with Path(edited_report).open("r", encoding="utf-8") as fh:
336
341
  edited_payload = json.load(fh)
@@ -364,35 +369,49 @@ def certify_command(
364
369
  else None
365
370
  ) or "unknown"
366
371
 
367
- # Enforce only when a metric block is present; skip for minimal stub reports
368
- # Enforce only when a primary_metric block is present
372
+ # Enforce only when a primary_metric block is present; allow degraded-but-flagged metrics to emit certificates, but fail the task.
369
373
  has_metric_block = isinstance(pm, dict) and bool(pm)
370
374
  if has_metric_block:
371
- # Treat non‑finite PM as hard error in CI/Release (after legacy fallback).
372
- # Require a finite final value; preview is optional for legacy reports.
373
- if not _finite(pm_final):
374
- err = InvarlockError(
375
+ degraded = bool(pm.get("invalid") or pm.get("degraded"))
376
+ if degraded or not _finite(pm_final):
377
+ fallback = pm_prev if _finite(pm_prev) else pm_final
378
+ if not _finite(fallback) or fallback <= 0:
379
+ fallback = 1.0
380
+ degraded_reason = pm.get("degraded_reason") or (
381
+ "non_finite_pm"
382
+ if (not _finite(pm_prev) or not _finite(pm_final))
383
+ else "primary_metric_degraded"
384
+ )
385
+ console.print(
386
+ "[yellow]⚠️ Primary metric degraded or non-finite; emitting certificate and marking task degraded. Primary metric computation failed.[/yellow]"
387
+ )
388
+ pm["degraded"] = True
389
+ pm["invalid"] = pm.get("invalid") or True
390
+ pm["preview"] = pm_prev if _finite(pm_prev) else fallback
391
+ pm["final"] = pm_final if _finite(pm_final) else fallback
392
+ pm["ratio_vs_baseline"] = pm_ratio if _finite(pm_ratio) else 1.0
393
+ pm["degraded_reason"] = degraded_reason
394
+ metrics["primary_metric"] = pm
395
+ edited_payload.setdefault("metrics", {}).update(metrics)
396
+
397
+ # Emit the certificate for inspection, then exit with a CI-visible error.
398
+ _report(
399
+ run=str(edited_report),
400
+ format="cert",
401
+ baseline=str(baseline_report),
402
+ output=cert_out,
403
+ )
404
+ err = MetricsError(
375
405
  code="E111",
376
- message=(
377
- "Primary metric computation failed (NaN/inf). "
378
- f"Context: device={device}, adapter={adapter_name}, edit={edit_name}. "
379
- "Baseline ok; edited failed to compute ppl. "
380
- "Try: use an accelerator (mps/cuda), force float32, reduce max_modules, "
381
- "or lower the evaluation batch size."
382
- ),
406
+ message=f"Primary metric degraded or non-finite ({degraded_reason}).",
383
407
  details={
384
- "device": device,
408
+ "reason": degraded_reason,
385
409
  "adapter": adapter_name,
410
+ "device": device,
386
411
  "edit": edit_name,
387
- "pm_preview": pm_prev,
388
- "pm_final": pm_final,
389
- "pm_ratio": pm_ratio,
390
412
  },
391
413
  )
392
- code = _resolve_exit_code(err, profile=prof)
393
- console.print(f"[red]{err}[/red]")
394
- # Do not emit a certificate
395
- raise typer.Exit(code)
414
+ raise typer.Exit(_resolve_exit_code(err, profile=profile))
396
415
 
397
416
  console.print("📜 Emitting certificate")
398
417
  _report(
@@ -188,7 +188,9 @@ def doctor_command(
188
188
  None, "--config", "-c", help="Path to YAML config for preflight lints"
189
189
  ),
190
190
  profile: str | None = typer.Option(
191
- None, "--profile", help="Profile to apply for preflight (ci|release)"
191
+ None,
192
+ "--profile",
193
+ help="Profile to apply for preflight (e.g. ci, release, ci_cpu; dev is a no-op)",
192
194
  ),
193
195
  baseline: str | None = typer.Option(
194
196
  None, "--baseline", help="Optional baseline report to check pairing readiness"
@@ -99,10 +99,6 @@ def explain_gates_command(
99
99
  pm = cert.get("primary_metric", {})
100
100
  ratio = pm.get("ratio_vs_baseline")
101
101
  ratio_ci = pm.get("display_ci")
102
- elif isinstance(cert.get("ppl"), dict): # legacy
103
- ppl = cert.get("ppl", {})
104
- ratio = ppl.get("ratio_vs_baseline")
105
- ratio_ci = ppl.get("ratio_ci")
106
102
  hysteresis_applied = bool(validation.get("hysteresis_applied"))
107
103
  status = "PASS" if bool(validation.get("primary_metric_acceptable")) else "FAIL"
108
104
  console.print("[bold]Gate: Primary Metric vs Baseline[/bold]")
@@ -125,6 +121,63 @@ def explain_gates_command(
125
121
  f" note: hysteresis applied → effective threshold = {limit_with_hyst:.3f}x"
126
122
  )
127
123
 
124
+ # Tail gate explanation (warn/fail; based on per-window Δlog-loss vs baseline)
125
+ pm_tail = (
126
+ cert.get("primary_metric_tail", {})
127
+ if isinstance(cert.get("primary_metric_tail"), dict)
128
+ else {}
129
+ )
130
+ if pm_tail:
131
+ mode = str(pm_tail.get("mode", "warn") or "warn").strip().lower()
132
+ evaluated = bool(pm_tail.get("evaluated", False))
133
+ passed = bool(pm_tail.get("passed", True))
134
+ policy = (
135
+ pm_tail.get("policy", {}) if isinstance(pm_tail.get("policy"), dict) else {}
136
+ )
137
+ stats = (
138
+ pm_tail.get("stats", {}) if isinstance(pm_tail.get("stats"), dict) else {}
139
+ )
140
+
141
+ q = policy.get("quantile", 0.95)
142
+ try:
143
+ qf = float(q)
144
+ except Exception:
145
+ qf = 0.95
146
+ qf = max(0.0, min(1.0, qf))
147
+ q_key = f"q{int(round(100.0 * qf))}"
148
+ q_name = f"P{int(round(100.0 * qf))}"
149
+ q_val = stats.get(q_key)
150
+ qmax = policy.get("quantile_max")
151
+ eps = policy.get("epsilon", stats.get("epsilon"))
152
+ mass = stats.get("tail_mass")
153
+ mmax = policy.get("mass_max")
154
+
155
+ if not evaluated:
156
+ status_tail = "INFO"
157
+ elif passed:
158
+ status_tail = "PASS"
159
+ elif mode == "fail":
160
+ status_tail = "FAIL"
161
+ else:
162
+ status_tail = "WARN"
163
+
164
+ console.print("\n[bold]Gate: Primary Metric Tail (ΔlogNLL)[/bold]")
165
+ console.print(f" mode: {mode}")
166
+ console.print(f" status: {status_tail}")
167
+ if isinstance(q_val, int | float):
168
+ console.print(f" observed: {q_name}={float(q_val):.4f}")
169
+ if isinstance(mass, int | float):
170
+ console.print(f" tail_mass: Pr[ΔlogNLL > ε]={float(mass):.4f}")
171
+ thr_parts: list[str] = []
172
+ if isinstance(qmax, int | float):
173
+ thr_parts.append(f"{q_name}≤{float(qmax):.4f}")
174
+ if isinstance(mmax, int | float):
175
+ thr_parts.append(f"mass≤{float(mmax):.4f}")
176
+ if isinstance(eps, int | float):
177
+ thr_parts.append(f"ε={float(eps):.1e}")
178
+ if thr_parts:
179
+ console.print(" threshold: " + "; ".join(thr_parts))
180
+
128
181
  # Dataset split visibility from report provenance
129
182
  try:
130
183
  split = (report_data.get("provenance", {}) or {}).get("dataset_split")
@@ -151,10 +204,6 @@ def explain_gates_command(
151
204
  drift = float(final) / float(preview)
152
205
  except Exception:
153
206
  drift = None
154
- if isinstance(cert.get("ppl"), dict): # legacy
155
- ppl = cert.get("ppl", {})
156
- drift = ppl.get("preview_final_ratio", drift)
157
- drift_ci = ppl.get("drift_ci")
158
207
  drift_status = (
159
208
  "PASS" if bool(validation.get("preview_final_drift_acceptable")) else "FAIL"
160
209
  )
@@ -120,7 +120,7 @@ def _generate_reports(
120
120
  else:
121
121
  console.print(f" 📄 {fmt.upper()}: {file_path}")
122
122
 
123
- # Show key metrics (PM-first). Avoid legacy PPL wording.
123
+ # Show key metrics (PM-first). Avoid PPL-first wording.
124
124
  console.print("\n📈 Key Metrics:")
125
125
  console.print(f" Model: {primary_report['meta']['model_id']}")
126
126
  console.print(f" Edit: {primary_report['edit']['name']}")