invarlock-0.3.6-py3-none-any.whl → invarlock-0.3.8-py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (73)
  1. invarlock/__init__.py +4 -4
  2. invarlock/adapters/__init__.py +10 -14
  3. invarlock/adapters/auto.py +37 -50
  4. invarlock/adapters/capabilities.py +2 -2
  5. invarlock/adapters/hf_causal.py +418 -0
  6. invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
  7. invarlock/adapters/hf_loading.py +7 -7
  8. invarlock/adapters/hf_mixin.py +53 -9
  9. invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
  10. invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
  11. invarlock/assurance/__init__.py +15 -23
  12. invarlock/cli/adapter_auto.py +32 -26
  13. invarlock/cli/app.py +128 -27
  14. invarlock/cli/commands/__init__.py +2 -2
  15. invarlock/cli/commands/calibrate.py +48 -4
  16. invarlock/cli/commands/doctor.py +8 -10
  17. invarlock/cli/commands/evaluate.py +986 -0
  18. invarlock/cli/commands/explain_gates.py +25 -17
  19. invarlock/cli/commands/export_html.py +11 -9
  20. invarlock/cli/commands/plugins.py +13 -9
  21. invarlock/cli/commands/report.py +326 -92
  22. invarlock/cli/commands/run.py +1160 -228
  23. invarlock/cli/commands/verify.py +157 -97
  24. invarlock/cli/config.py +1 -1
  25. invarlock/cli/determinism.py +1 -1
  26. invarlock/cli/doctor_helpers.py +4 -5
  27. invarlock/cli/output.py +193 -0
  28. invarlock/cli/provenance.py +4 -4
  29. invarlock/core/bootstrap.py +1 -1
  30. invarlock/core/registry.py +9 -11
  31. invarlock/core/retry.py +14 -14
  32. invarlock/core/runner.py +112 -26
  33. invarlock/edits/noop.py +2 -2
  34. invarlock/edits/quant_rtn.py +67 -39
  35. invarlock/eval/__init__.py +1 -1
  36. invarlock/eval/bench.py +14 -10
  37. invarlock/eval/data.py +68 -23
  38. invarlock/eval/metrics.py +59 -1
  39. invarlock/eval/primary_metric.py +1 -1
  40. invarlock/eval/tasks/__init__.py +12 -0
  41. invarlock/eval/tasks/classification.py +48 -0
  42. invarlock/eval/tasks/qa.py +36 -0
  43. invarlock/eval/tasks/text_generation.py +102 -0
  44. invarlock/guards/invariants.py +19 -10
  45. invarlock/guards/rmt.py +2 -2
  46. invarlock/guards/spectral.py +1 -1
  47. invarlock/guards/variance.py +2 -2
  48. invarlock/model_profile.py +64 -62
  49. invarlock/observability/health.py +6 -6
  50. invarlock/observability/metrics.py +108 -0
  51. invarlock/plugins/hf_bnb_adapter.py +32 -21
  52. invarlock/reporting/__init__.py +18 -4
  53. invarlock/reporting/guards_analysis.py +154 -4
  54. invarlock/reporting/html.py +61 -11
  55. invarlock/reporting/normalizer.py +9 -2
  56. invarlock/reporting/policy_utils.py +1 -1
  57. invarlock/reporting/primary_metric_utils.py +11 -11
  58. invarlock/reporting/render.py +876 -510
  59. invarlock/reporting/report.py +72 -30
  60. invarlock/reporting/{certificate.py → report_builder.py} +252 -99
  61. invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
  62. invarlock/reporting/report_types.py +6 -1
  63. invarlock/reporting/telemetry.py +86 -0
  64. invarlock-0.3.8.dist-info/METADATA +283 -0
  65. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/RECORD +69 -64
  66. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/WHEEL +1 -1
  67. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/entry_points.txt +5 -3
  68. invarlock/adapters/hf_gpt2.py +0 -404
  69. invarlock/adapters/hf_llama.py +0 -487
  70. invarlock/cli/commands/certify.py +0 -422
  71. invarlock-0.3.6.dist-info/METADATA +0 -588
  72. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/licenses/LICENSE +0 -0
  73. {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/top_level.txt +0 -0
invarlock/cli/app.py CHANGED
@@ -17,7 +17,11 @@ import typer
17
17
  from rich.console import Console
18
18
  from typer.core import TyperGroup
19
19
 
20
- from invarlock.security import enforce_default_security
20
+ from invarlock.security import (
21
+ enforce_default_security,
22
+ enforce_network_policy,
23
+ network_policy_allows,
24
+ )
21
25
 
22
26
  # Lightweight import mode disables heavy side effects in some modules, but we no
23
27
  # longer force plugin discovery off globally here; individual commands may gate
@@ -33,7 +37,7 @@ LIGHT_IMPORT = os.getenv("INVARLOCK_LIGHT_IMPORT", "").strip().lower() in {
33
37
  class OrderedGroup(TyperGroup):
34
38
  def list_commands(self, ctx): # type: ignore[override]
35
39
  return [
36
- "certify",
40
+ "evaluate",
37
41
  "calibrate",
38
42
  "report",
39
43
  "verify",
@@ -48,8 +52,8 @@ class OrderedGroup(TyperGroup):
48
52
  app = typer.Typer(
49
53
  name="invarlock",
50
54
  help=(
51
- "InvarLock — certify model changes with deterministic pairing and safety gates.\n"
52
- "Quick path: invarlock certify --baseline <MODEL> --subject <MODEL>\n"
55
+ "InvarLock — evaluate model changes with deterministic pairing and safety gates.\n"
56
+ "Quick path: invarlock evaluate --baseline <MODEL> --subject <MODEL>\n"
53
57
  "Hint: use --edit-config to run the built-in quant_rtn demo.\n"
54
58
  "Tip: enable downloads with INVARLOCK_ALLOW_NETWORK=1 when fetching.\n"
55
59
  "Exit codes:\n"
@@ -65,17 +69,16 @@ app = typer.Typer(
65
69
  console = Console()
66
70
 
67
71
 
68
- @app.command()
69
- def version():
70
- """Show InvarLock version."""
72
+ def _emit_version() -> None:
73
+ """Emit the InvarLock version string."""
71
74
  # Prefer package metadata when available so CLI reflects wheel truth
72
75
  try:
73
76
  from importlib.metadata import version as _pkg_version
74
77
 
75
78
  schema = None
76
79
  try:
77
- from invarlock.reporting.certificate import (
78
- CERTIFICATE_SCHEMA_VERSION as _SCHEMA,
80
+ from invarlock.reporting.report_builder import (
81
+ REPORT_SCHEMA_VERSION as _SCHEMA,
79
82
  )
80
83
 
81
84
  schema = _SCHEMA
@@ -96,26 +99,59 @@ def version():
96
99
  console.print("InvarLock version unknown")
97
100
 
98
101
 
102
+ @app.callback(invoke_without_command=True)
103
+ def _root(
104
+ ctx: typer.Context,
105
+ show_version: bool = typer.Option(
106
+ False,
107
+ "--version",
108
+ "-V",
109
+ help="Show version and exit.",
110
+ is_eager=True,
111
+ ),
112
+ ) -> None:
113
+ was_allowed = network_policy_allows()
114
+ enforce_default_security()
115
+ ctx.call_on_close(lambda: enforce_network_policy(was_allowed))
116
+ if show_version:
117
+ _emit_version()
118
+ raise typer.Exit()
119
+
120
+
121
+ @app.command()
122
+ def version():
123
+ """Show InvarLock version."""
124
+ _emit_version()
125
+
126
+
99
127
  """Register command modules and groups in the desired help order.
100
128
 
101
- Order: certify → report → run → plugins → doctor → version
129
+ Order: evaluate → report → run → plugins → doctor → version
102
130
  """
103
131
 
104
132
 
105
133
  @app.command(
106
- name="certify",
134
+ name="evaluate",
107
135
  help=(
108
- "Certify a subject model against a baseline and generate a safety certificate. "
136
+ "Evaluate a subject model against a baseline and generate an evaluation report. "
109
137
  "Use when you have two model snapshots and want pass/fail gating."
110
138
  ),
111
139
  )
112
- def _certify_lazy(
140
+ def _evaluate_lazy(
113
141
  source: str = typer.Option(
114
142
  ..., "--source", "--baseline", help="Baseline model dir or Hub ID"
115
143
  ),
116
144
  edited: str = typer.Option(
117
145
  ..., "--edited", "--subject", help="Subject model dir or Hub ID"
118
146
  ),
147
+ baseline_report: str | None = typer.Option(
148
+ None,
149
+ "--baseline-report",
150
+ help=(
151
+ "Reuse an existing baseline run report.json (skips baseline evaluation). "
152
+ "Must include stored evaluation windows (e.g., set INVARLOCK_STORE_EVAL_WINDOWS=1)."
153
+ ),
154
+ ),
119
155
  adapter: str = typer.Option(
120
156
  "auto", "--adapter", help="Adapter name or 'auto' to resolve"
121
157
  ),
@@ -133,26 +169,60 @@ def _certify_lazy(
133
169
  ),
134
170
  ),
135
171
  out: str = typer.Option("runs", "--out", help="Base output directory"),
136
- cert_out: str = typer.Option(
137
- "reports/cert", "--cert-out", help="Certificate output directory"
172
+ report_out: str = typer.Option(
173
+ "reports/eval", "--report-out", help="Evaluation report output directory"
138
174
  ),
139
175
  edit_config: str | None = typer.Option(
140
176
  None, "--edit-config", help="Edit preset to apply a demo edit (quant_rtn)"
141
177
  ),
178
+ edit_label: str | None = typer.Option(
179
+ None,
180
+ "--edit-label",
181
+ help=(
182
+ "Edit algorithm label for BYOE models. Use 'noop' for baseline, "
183
+ "'quant_rtn' etc. for built-in edits, 'custom' for pre-edited models."
184
+ ),
185
+ ),
186
+ quiet: bool = typer.Option(
187
+ False, "--quiet", "-q", help="Minimal output (suppress run/report detail)"
188
+ ),
189
+ verbose: bool = typer.Option(
190
+ False, "--verbose", "-v", help="Verbose output (include debug details)"
191
+ ),
192
+ banner: bool = typer.Option(
193
+ True, "--banner/--no-banner", help="Show header banner"
194
+ ),
195
+ style: str = typer.Option("audit", "--style", help="Output style (audit|friendly)"),
196
+ timing: bool = typer.Option(False, "--timing", help="Show timing summary"),
197
+ progress: bool = typer.Option(
198
+ True, "--progress/--no-progress", help="Show progress done messages"
199
+ ),
200
+ no_color: bool = typer.Option(
201
+ False, "--no-color", help="Disable ANSI colors (respects NO_COLOR=1)"
202
+ ),
142
203
  ):
143
- from .commands.certify import certify_command as _cert
204
+ from .commands.evaluate import evaluate_command as _eval
144
205
 
145
- return _cert(
206
+ return _eval(
146
207
  source=source,
147
208
  edited=edited,
209
+ baseline_report=baseline_report,
148
210
  adapter=adapter,
149
211
  device=device,
150
212
  profile=profile,
151
213
  tier=tier,
152
214
  preset=preset,
153
215
  out=out,
154
- cert_out=cert_out,
216
+ report_out=report_out,
155
217
  edit_config=edit_config,
218
+ edit_label=edit_label,
219
+ quiet=quiet,
220
+ verbose=verbose,
221
+ banner=banner,
222
+ style=style,
223
+ timing=timing,
224
+ progress=progress,
225
+ no_color=no_color,
156
226
  )
157
227
 
158
228
 
@@ -184,18 +254,18 @@ def _register_subapps() -> None:
184
254
  @app.command(
185
255
  name="verify",
186
256
  help=(
187
- "Verify certificate JSON(s) against schema, pairing math, and gates. "
257
+ "Verify evaluation report JSON(s) against schema, pairing math, and gates. "
188
258
  "Use --json for a single-line machine-readable envelope."
189
259
  ),
190
260
  )
191
261
  def _verify_typed(
192
- certificates: list[str] = typer.Argument(
193
- ..., help="One or more certificate JSON files to verify."
262
+ reports: list[str] = typer.Argument(
263
+ ..., help="One or more evaluation report JSON files to verify."
194
264
  ),
195
265
  baseline: str | None = typer.Option(
196
266
  None,
197
267
  "--baseline",
198
- help="Optional baseline certificate/report JSON to enforce provider parity.",
268
+ help="Optional baseline evaluation report JSON to enforce provider parity.",
199
269
  ),
200
270
  tolerance: float = typer.Option(
201
271
  1e-9, "--tolerance", help="Tolerance for analysis-basis comparisons."
@@ -215,10 +285,10 @@ def _verify_typed(
215
285
 
216
286
  from .commands.verify import verify_command as _verify
217
287
 
218
- cert_paths = [_Path(c) for c in certificates]
288
+ report_paths = [_Path(p) for p in reports]
219
289
  baseline_path = _Path(baseline) if isinstance(baseline, str) else None
220
290
  return _verify(
221
- certificates=cert_paths,
291
+ reports=report_paths,
222
292
  baseline=baseline_path,
223
293
  tolerance=tolerance,
224
294
  profile=profile,
@@ -230,7 +300,7 @@ def _verify_typed(
230
300
  name="run",
231
301
  help=(
232
302
  "Execute an end-to-end run from a YAML config (edit + guards + reports). "
233
- "Writes run artifacts and optionally a safety certificate."
303
+ "Writes run artifacts and optionally an evaluation report."
234
304
  ),
235
305
  )
236
306
  def _run_typed(
@@ -245,16 +315,31 @@ def _run_typed(
245
315
  ),
246
316
  out: str | None = typer.Option(None, "--out", help="Output directory override"),
247
317
  edit: str | None = typer.Option(None, "--edit", help="Edit kind (quant|mixed)"),
318
+ edit_label: str | None = typer.Option(
319
+ None,
320
+ "--edit-label",
321
+ help=(
322
+ "Edit algorithm label for BYOE models. Use 'noop' for baseline, "
323
+ "'quant_rtn' etc. for built-in edits, 'custom' for pre-edited models."
324
+ ),
325
+ ),
248
326
  tier: str | None = typer.Option(
249
327
  None,
250
328
  "--tier",
251
329
  help="Auto-tuning tier override (conservative|balanced|aggressive)",
252
330
  ),
331
+ metric_kind: str | None = typer.Option(
332
+ None,
333
+ "--metric-kind",
334
+ help="Primary metric kind override (ppl_causal|ppl_mlm|accuracy|etc.)",
335
+ ),
253
336
  probes: int | None = typer.Option(
254
337
  None, "--probes", help="Number of micro-probes (0=deterministic, >0=adaptive)"
255
338
  ),
256
339
  until_pass: bool = typer.Option(
257
- False, "--until-pass", help="Retry until certificate passes (max 3 attempts)"
340
+ False,
341
+ "--until-pass",
342
+ help="Retry until evaluation report passes gates (max 3 attempts)",
258
343
  ),
259
344
  max_attempts: int = typer.Option(
260
345
  3, "--max-attempts", help="Maximum retry attempts for --until-pass mode"
@@ -265,11 +350,21 @@ def _run_typed(
265
350
  baseline: str | None = typer.Option(
266
351
  None,
267
352
  "--baseline",
268
- help="Path to baseline report.json for certificate validation",
353
+ help="Path to baseline report.json for evaluation report validation",
269
354
  ),
270
355
  no_cleanup: bool = typer.Option(
271
356
  False, "--no-cleanup", help="Skip cleanup of temporary artifacts"
272
357
  ),
358
+ style: str | None = typer.Option(
359
+ None, "--style", help="Output style (audit|friendly)"
360
+ ),
361
+ progress: bool = typer.Option(
362
+ False, "--progress", help="Show progress done messages"
363
+ ),
364
+ timing: bool = typer.Option(False, "--timing", help="Show timing summary"),
365
+ no_color: bool = typer.Option(
366
+ False, "--no-color", help="Disable ANSI colors (respects NO_COLOR=1)"
367
+ ),
273
368
  ):
274
369
  from .commands.run import run_command as _run
275
370
 
@@ -279,13 +374,19 @@ def _run_typed(
279
374
  profile=profile,
280
375
  out=out,
281
376
  edit=edit,
377
+ edit_label=edit_label,
282
378
  tier=tier,
379
+ metric_kind=metric_kind,
283
380
  probes=probes,
284
381
  until_pass=until_pass,
285
382
  max_attempts=max_attempts,
286
383
  timeout=timeout,
287
384
  baseline=baseline,
288
385
  no_cleanup=no_cleanup,
386
+ style=style,
387
+ progress=progress,
388
+ timing=timing,
389
+ no_color=no_color,
289
390
  )
290
391
 
291
392
 
invarlock/cli/commands/__init__.py CHANGED
@@ -5,8 +5,8 @@ Import-time work is minimal; subcommands themselves may perform heavier imports
5
5
  only when invoked.
6
6
  """
7
7
 
8
- from .certify import certify_command
9
8
  from .doctor import doctor_command
9
+ from .evaluate import evaluate_command
10
10
  from .explain_gates import explain_gates_command
11
11
  from .export_html import export_html_command
12
12
  from .plugins import plugins_command
@@ -15,7 +15,7 @@ from .run import run_command
15
15
  from .verify import verify_command
16
16
 
17
17
  __all__ = [
18
- "certify_command",
18
+ "evaluate_command",
19
19
  "doctor_command",
20
20
  "explain_gates_command",
21
21
  "export_html_command",
invarlock/cli/commands/calibrate.py CHANGED
@@ -20,10 +20,6 @@ import typer
20
20
  import yaml
21
21
  from rich.console import Console
22
22
 
23
- from invarlock.calibration.spectral_null import summarize_null_sweep_reports
24
- from invarlock.calibration.variance_ve import summarize_ve_sweep_reports
25
- from invarlock.guards.tier_config import get_tier_guard_config
26
-
27
23
  console = Console()
28
24
 
29
25
  calibrate_app = typer.Typer(
@@ -114,6 +110,26 @@ def _write_tiers_recommendation(
114
110
  )
115
111
 
116
112
 
113
+ def get_tier_guard_config(tier: str, guard_key: str) -> dict[str, Any]:
114
+ """Lazy wrapper for tier config lookup.
115
+
116
+ This is intentionally a module-level symbol so tests can patch it without
117
+ importing torch/transformers at import time.
118
+ """
119
+ try:
120
+ from invarlock.guards.tier_config import get_tier_guard_config as _get_cfg
121
+ except ModuleNotFoundError as exc:
122
+ missing = getattr(exc, "name", "") or ""
123
+ if missing in {"torch", "transformers"}:
124
+ console.print(
125
+ "[red]Missing optional dependencies for calibration.[/red] "
126
+ "Install `invarlock[hf]` (or at least torch/transformers) to run sweeps."
127
+ )
128
+ raise typer.Exit(1) from exc
129
+ raise
130
+ return _get_cfg(tier, guard_key)
131
+
132
+
117
133
  @calibrate_app.command(
118
134
  name="null-sweep",
119
135
  help="Run a null (no-op edit) sweep and calibrate spectral κ/alpha empirically.",
@@ -160,6 +176,21 @@ def null_sweep(
160
176
  # Keep import light: only pull run machinery when invoked.
161
177
  from .run import run_command
162
178
 
179
+ # Optional deps: calibration sweeps require torch/guards, but docs/tests may
180
+ # import this module without heavy deps. Import lazily so CLI example
181
+ # validation can parse `invarlock calibrate ...` without installing torch.
182
+ try:
183
+ from invarlock.calibration.spectral_null import summarize_null_sweep_reports
184
+ except ModuleNotFoundError as exc:
185
+ missing = getattr(exc, "name", "") or ""
186
+ if missing in {"torch", "transformers"}:
187
+ console.print(
188
+ "[red]Missing optional dependencies for calibration.[/red] "
189
+ "Install `invarlock[hf]` (or at least torch/transformers) to run sweeps."
190
+ )
191
+ raise typer.Exit(1) from exc
192
+ raise
193
+
163
194
  base = _load_yaml(config)
164
195
  specs = _materialize_sweep_specs(
165
196
  tiers=tiers, seeds=seed, n_seeds=n_seeds, seed_start=seed_start
@@ -378,6 +409,19 @@ def ve_sweep(
378
409
  # Keep import light: only pull run machinery when invoked.
379
410
  from .run import run_command
380
411
 
412
+ # Optional deps: see null_sweep() note.
413
+ try:
414
+ from invarlock.calibration.variance_ve import summarize_ve_sweep_reports
415
+ except ModuleNotFoundError as exc:
416
+ missing = getattr(exc, "name", "") or ""
417
+ if missing in {"torch", "transformers"}:
418
+ console.print(
419
+ "[red]Missing optional dependencies for calibration.[/red] "
420
+ "Install `invarlock[hf]` (or at least torch/transformers) to run sweeps."
421
+ )
422
+ raise typer.Exit(1) from exc
423
+ raise
424
+
381
425
  base = _load_yaml(config)
382
426
  windows = [int(w) for w in (window or [])] or [6, 8, 12, 16]
383
427
  specs = _materialize_sweep_specs(
invarlock/cli/commands/doctor.py CHANGED
@@ -1042,8 +1042,7 @@ def doctor_command(
1042
1042
  module = str(info.get("module") or "")
1043
1043
  support = (
1044
1044
  "auto"
1045
- if module.startswith("invarlock.adapters")
1046
- and n in {"hf_causal_auto", "hf_mlm_auto"}
1045
+ if module.startswith("invarlock.adapters") and n in {"hf_auto"}
1047
1046
  else (
1048
1047
  "core"
1049
1048
  if module.startswith("invarlock.adapters")
@@ -1058,11 +1057,10 @@ def doctor_command(
1058
1057
 
1059
1058
  # Heuristic backend mapping without heavy imports
1060
1059
  if n in {
1061
- "hf_gpt2",
1062
- "hf_bert",
1063
- "hf_llama",
1064
- "hf_causal_auto",
1065
- "hf_mlm_auto",
1060
+ "hf_causal",
1061
+ "hf_mlm",
1062
+ "hf_seq2seq",
1063
+ "hf_auto",
1066
1064
  }:
1067
1065
  # Transformers-based
1068
1066
  backend = "transformers"
@@ -1097,8 +1095,8 @@ def doctor_command(
1097
1095
  }.get(n)
1098
1096
  if hint:
1099
1097
  enable = f"pip install '{hint}'"
1100
- # Special-case: hf_onnx is a core adapter but requires Optimum/ONNXRuntime
1101
- if n == "hf_onnx":
1098
+ # Special-case: ONNX causal adapter is core but requires Optimum/ONNXRuntime
1099
+ if n == "hf_causal_onnx":
1102
1100
  backend = backend or "onnxruntime"
1103
1101
  present = (
1104
1102
  importlib.util.find_spec("optimum.onnxruntime") is not None
@@ -1322,7 +1320,7 @@ def doctor_command(
1322
1320
  if "optimum" in str(e).lower():
1323
1321
  if not json_out:
1324
1322
  console.print(
1325
- " [yellow]⚠️ Optional Optimum/ONNXRuntime missing; hf_onnx will be shown as needs_extra[/yellow]"
1323
+ " [yellow]⚠️ Optional Optimum/ONNXRuntime missing; hf_causal_onnx will be shown as needs_extra[/yellow]"
1326
1324
  )
1327
1325
  # Do not mark overall health as failed for optional extras
1328
1326
  else: