invarlock-0.3.6-py3-none-any.whl → invarlock-0.3.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +4 -4
- invarlock/adapters/__init__.py +10 -14
- invarlock/adapters/auto.py +37 -50
- invarlock/adapters/capabilities.py +2 -2
- invarlock/adapters/hf_causal.py +418 -0
- invarlock/adapters/{hf_onnx.py → hf_causal_onnx.py} +3 -3
- invarlock/adapters/hf_loading.py +7 -7
- invarlock/adapters/hf_mixin.py +53 -9
- invarlock/adapters/{hf_bert.py → hf_mlm.py} +4 -11
- invarlock/adapters/{hf_t5.py → hf_seq2seq.py} +9 -9
- invarlock/assurance/__init__.py +15 -23
- invarlock/cli/adapter_auto.py +32 -26
- invarlock/cli/app.py +128 -27
- invarlock/cli/commands/__init__.py +2 -2
- invarlock/cli/commands/calibrate.py +48 -4
- invarlock/cli/commands/doctor.py +8 -10
- invarlock/cli/commands/evaluate.py +986 -0
- invarlock/cli/commands/explain_gates.py +25 -17
- invarlock/cli/commands/export_html.py +11 -9
- invarlock/cli/commands/plugins.py +13 -9
- invarlock/cli/commands/report.py +326 -92
- invarlock/cli/commands/run.py +1160 -228
- invarlock/cli/commands/verify.py +157 -97
- invarlock/cli/config.py +1 -1
- invarlock/cli/determinism.py +1 -1
- invarlock/cli/doctor_helpers.py +4 -5
- invarlock/cli/output.py +193 -0
- invarlock/cli/provenance.py +4 -4
- invarlock/core/bootstrap.py +1 -1
- invarlock/core/registry.py +9 -11
- invarlock/core/retry.py +14 -14
- invarlock/core/runner.py +112 -26
- invarlock/edits/noop.py +2 -2
- invarlock/edits/quant_rtn.py +67 -39
- invarlock/eval/__init__.py +1 -1
- invarlock/eval/bench.py +14 -10
- invarlock/eval/data.py +68 -23
- invarlock/eval/metrics.py +59 -1
- invarlock/eval/primary_metric.py +1 -1
- invarlock/eval/tasks/__init__.py +12 -0
- invarlock/eval/tasks/classification.py +48 -0
- invarlock/eval/tasks/qa.py +36 -0
- invarlock/eval/tasks/text_generation.py +102 -0
- invarlock/guards/invariants.py +19 -10
- invarlock/guards/rmt.py +2 -2
- invarlock/guards/spectral.py +1 -1
- invarlock/guards/variance.py +2 -2
- invarlock/model_profile.py +64 -62
- invarlock/observability/health.py +6 -6
- invarlock/observability/metrics.py +108 -0
- invarlock/plugins/hf_bnb_adapter.py +32 -21
- invarlock/reporting/__init__.py +18 -4
- invarlock/reporting/guards_analysis.py +154 -4
- invarlock/reporting/html.py +61 -11
- invarlock/reporting/normalizer.py +9 -2
- invarlock/reporting/policy_utils.py +1 -1
- invarlock/reporting/primary_metric_utils.py +11 -11
- invarlock/reporting/render.py +876 -510
- invarlock/reporting/report.py +72 -30
- invarlock/reporting/{certificate.py → report_builder.py} +252 -99
- invarlock/reporting/{certificate_schema.py → report_schema.py} +22 -22
- invarlock/reporting/report_types.py +6 -1
- invarlock/reporting/telemetry.py +86 -0
- invarlock-0.3.8.dist-info/METADATA +283 -0
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/RECORD +69 -64
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/WHEEL +1 -1
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/entry_points.txt +5 -3
- invarlock/adapters/hf_gpt2.py +0 -404
- invarlock/adapters/hf_llama.py +0 -487
- invarlock/cli/commands/certify.py +0 -422
- invarlock-0.3.6.dist-info/METADATA +0 -588
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/licenses/LICENSE +0 -0
- {invarlock-0.3.6.dist-info → invarlock-0.3.8.dist-info}/top_level.txt +0 -0
invarlock/cli/commands/verify.py
CHANGED

@@ -2,7 +2,7 @@
 invarlock verify command
 ====================

-Validates generated
+Validates generated evaluation reports for internal consistency. The command
 ensures schema compliance, checks that the primary metric ratio agrees with the
 baseline reference, and enforces paired-window guarantees (match=1.0,
 overlap=0.0).
@@ -26,7 +26,7 @@ from invarlock.core.exceptions import (
 from invarlock.core.exceptions import (
     ValidationError as _ValidationError,
 )
-from invarlock.reporting.
+from invarlock.reporting.report_builder import validate_report

 from .._json import emit as _emit_json
 from .._json import encode_error as _encode_error
@@ -52,26 +52,42 @@ def _coerce_int(value: Any) -> int | None:
     return out if out >= 0 else None


-def
-    """Load
+def _load_evaluation_report(path: Path) -> dict[str, Any]:
+    """Load an evaluation report JSON from disk."""
     with path.open("r", encoding="utf-8") as handle:
         return json.load(handle)


-def _validate_primary_metric(
+def _validate_primary_metric(report: dict[str, Any]) -> list[str]:
     """Validate primary metric ratio consistency with baseline reference."""
     errors: list[str] = []
-    pm =
+    pm = report.get("primary_metric", {}) or {}
     if not isinstance(pm, dict) or not pm:
-        errors.append("
+        errors.append("report missing primary_metric block.")
         return errors

+    def _is_finite_number(value: Any) -> bool:
+        return isinstance(value, (int, float)) and math.isfinite(float(value))
+
+    def _declares_invalid_primary_metric(metric: dict[str, Any]) -> bool:
+        if bool(metric.get("invalid")):
+            return True
+        reason = metric.get("degraded_reason")
+        if isinstance(reason, str):
+            r = reason.strip().lower()
+            return r.startswith("non_finite") or r in {
+                "primary_metric_invalid",
+                "evaluation_error",
+            }
+        return False
+
     kind = str(pm.get("kind", "")).lower()
     ratio_vs_baseline = pm.get("ratio_vs_baseline")
     final = pm.get("final")
+    pm_invalid = _declares_invalid_primary_metric(pm)

     if kind.startswith("ppl"):
-        baseline_ref =
+        baseline_ref = report.get("baseline_ref", {}) or {}
         baseline_pm = (
             baseline_ref.get("primary_metric")
             if isinstance(baseline_ref, dict)
@@ -82,18 +98,16 @@ def _validate_primary_metric(certificate: dict[str, Any]) -> list[str]:
         bv = baseline_pm.get("final")
         if isinstance(bv, (int | float)):
             baseline_final = float(bv)
-        if
-            if baseline_final <= 0.0:
+        if _is_finite_number(final) and _is_finite_number(baseline_final):
+            if float(baseline_final) <= 0.0:
                 errors.append(
                     f"Baseline final must be > 0.0 to compute ratio (found {baseline_final})."
                 )
             else:
                 expected_ratio = float(final) / float(baseline_final)
-                if not
-                    float(ratio_vs_baseline)
-                ):
+                if not _is_finite_number(ratio_vs_baseline):
                     errors.append(
-                        "
+                        "report is missing a finite primary_metric.ratio_vs_baseline value."
                     )
                 elif not math.isclose(
                     float(ratio_vs_baseline), expected_ratio, rel_tol=1e-6, abs_tol=1e-6
@@ -102,19 +116,30 @@ def _validate_primary_metric(certificate: dict[str, Any]) -> list[str]:
                         "Primary metric ratio mismatch: "
                         f"recorded={float(ratio_vs_baseline):.12f}, expected={expected_ratio:.12f}"
                     )
+        else:
+            # If the primary metric is non-finite, it must be explicitly marked invalid.
+            # This is expected for structural error-injection runs (NaN/Inf weights).
+            if (isinstance(final, (int | float)) and not _is_finite_number(final)) and (
+                not pm_invalid
+            ):
+                errors.append(
+                    "Primary metric final is non-finite but primary_metric.invalid is not set."
+                )
     else:
+        if pm_invalid:
+            return errors
         if ratio_vs_baseline is None or not isinstance(ratio_vs_baseline, int | float):
             errors.append(
-                "
+                "report missing primary_metric.ratio_vs_baseline for non-ppl metric."
             )

     return errors


-def _validate_pairing(
+def _validate_pairing(report: dict[str, Any]) -> list[str]:
     """Validate window pairing metrics (PM-only location)."""
     errors: list[str] = []
-    stats =
+    stats = report.get("dataset", {}).get("windows", {}).get("stats", {})

     match_fraction = stats.get("window_match_fraction")
     overlap_fraction = stats.get("window_overlap_fraction")
@@ -123,23 +148,23 @@ def _validate_pairing(certificate: dict[str, Any]) -> list[str]:

     if pairing_reason is not None:
         errors.append(
-            "window_pairing_reason must be null/None for paired
+            "window_pairing_reason must be null/None for paired reports "
             f"(found {pairing_reason!r})."
         )
     if paired_windows is None:
-        errors.append("
+        errors.append("report missing paired_windows metric.")
     elif paired_windows == 0:
-        errors.append("paired_windows must be > 0 for paired
+        errors.append("paired_windows must be > 0 for paired reports (found 0).")

     if match_fraction is None:
-        errors.append("
+        errors.append("report missing window_match_fraction metric.")
     elif match_fraction < 0.999999:
         errors.append(
             f"window_match_fraction must be 1.0 for paired runs (found {match_fraction:.6f})."
         )

     if overlap_fraction is None:
-        errors.append("
+        errors.append("report missing window_overlap_fraction metric.")
     elif overlap_fraction > 1e-9:
         errors.append(
             f"window_overlap_fraction must be 0.0 (found {overlap_fraction:.6f})."
@@ -148,10 +173,10 @@ def _validate_pairing(certificate: dict[str, Any]) -> list[str]:
     return errors


-def _validate_counts(
+def _validate_counts(report: dict[str, Any]) -> list[str]:
     """Validate preview/final window counts align with dataset configuration."""
     errors: list[str] = []
-    dataset =
+    dataset = report.get("dataset", {})
     dataset_windows = dataset.get("windows", {})
     expected_preview = dataset_windows.get("preview")
     expected_final = dataset_windows.get("final")
@@ -165,9 +190,7 @@ def _validate_counts(certificate: dict[str, Any]) -> list[str]:

     if expected_preview is not None:
         if preview_used is None:
-            errors.append(
-                "Certificate missing coverage.preview.used for preview windows."
-            )
+            errors.append("report missing coverage.preview.used for preview windows.")
         elif int(preview_used) != int(expected_preview):
             errors.append(
                 f"Preview window count mismatch: expected {expected_preview}, observed {preview_used}."
@@ -175,7 +198,7 @@ def _validate_counts(certificate: dict[str, Any]) -> list[str]:

     if expected_final is not None:
         if final_used is None:
-            errors.append("
+            errors.append("report missing coverage.final.used for final windows.")
         elif int(final_used) != int(expected_final):
             errors.append(
                 f"Final window count mismatch: expected {expected_final}, observed {final_used}."
@@ -193,40 +216,79 @@ def _validate_counts(certificate: dict[str, Any]) -> list[str]:
     return errors


-def _validate_drift_band(
-    """Validate preview→final drift stays within the configured band
+def _validate_drift_band(report: dict[str, Any]) -> list[str]:
+    """Validate preview→final drift stays within the configured band.
+
+    Defaults to 0.95–1.05 unless the report provides `primary_metric.drift_band`.
+    """
     errors: list[str] = []
-    pm =
+    pm = report.get("primary_metric", {}) or {}
+    if not isinstance(pm, dict) or not pm:
+        errors.append("report missing primary_metric block.")
+        return errors
+    if bool(pm.get("invalid")):
+        # Drift is undefined when the primary metric is invalid (e.g., NaN/Inf weights).
+        return errors
     drift_ratio = None
     try:
         prev = pm.get("preview")
         fin = pm.get("final")
-        if
+        if (
+            isinstance(prev, int | float)
+            and isinstance(fin, int | float)
+            and math.isfinite(float(prev))
+            and math.isfinite(float(fin))
+            and prev > 0
+        ):
             drift_ratio = float(fin) / float(prev)
     except Exception:
         drift_ratio = None

     if not isinstance(drift_ratio, int | float):
-        errors.append("
+        errors.append("report missing preview/final to compute drift ratio.")
         return errors

-
+    drift_min = 0.95
+    drift_max = 1.05
+    band = pm.get("drift_band")
+    try:
+        if isinstance(band, dict):
+            lo = band.get("min")
+            hi = band.get("max")
+            if isinstance(lo, int | float) and isinstance(hi, int | float):
+                lo_f = float(lo)
+                hi_f = float(hi)
+                if math.isfinite(lo_f) and math.isfinite(hi_f) and 0 < lo_f < hi_f:
+                    drift_min = lo_f
+                    drift_max = hi_f
+        elif isinstance(band, list | tuple) and len(band) == 2:
+            lo_raw, hi_raw = band[0], band[1]
+            if isinstance(lo_raw, int | float) and isinstance(hi_raw, int | float):
+                lo_f = float(lo_raw)
+                hi_f = float(hi_raw)
+                if math.isfinite(lo_f) and math.isfinite(hi_f) and 0 < lo_f < hi_f:
+                    drift_min = lo_f
+                    drift_max = hi_f
+    except Exception:
+        pass
+
+    if not drift_min <= float(drift_ratio) <= drift_max:
         errors.append(
-            f"Preview→final drift ratio out of band (
+            f"Preview→final drift ratio out of band ({drift_min:.2f}–{drift_max:.2f}): observed {drift_ratio:.6f}."
         )

     return errors


-def _validate_tokenizer_hash(
+def _validate_tokenizer_hash(report: dict[str, Any]) -> list[str]:
     """Validate tokenizer hash consistency between baseline and edited runs.

     The check is enforced only when both hashes are present. When present and
     different, the verification fails.
     """
     errors: list[str] = []
-    meta =
-    dataset =
+    meta = report.get("meta", {}) or {}
+    dataset = report.get("dataset", {}) or {}
     edited_hash = None
     try:
         # Prefer meta.tokenizer_hash; fall back to dataset.tokenizer.hash
@@ -238,7 +300,7 @@ def _validate_tokenizer_hash(certificate: dict[str, Any]) -> list[str]:
     except Exception:
         edited_hash = None

-    baseline_ref =
+    baseline_ref = report.get("baseline_ref", {}) or {}
     baseline_hash = baseline_ref.get("tokenizer_hash")

     if isinstance(edited_hash, str) and isinstance(baseline_hash, str):
@@ -270,15 +332,15 @@ def _measurement_contract_digest(contract: Any) -> str | None:


 def _validate_measurement_contracts(
-
+    report: dict[str, Any], *, profile: str
 ) -> list[str]:
     """Enforce measurement-contract presence and baseline pairing for guards."""
     errors: list[str] = []
     prof = (profile or "").strip().lower()
-    resolved_policy =
+    resolved_policy = report.get("resolved_policy") or {}

     for guard_key in ("spectral", "rmt"):
-        block =
+        block = report.get(guard_key) or {}
         if not isinstance(block, dict):
             continue
         evaluated = bool(block.get("evaluated", True))
@@ -289,14 +351,14 @@ def _validate_measurement_contracts(
         mc_hash = _measurement_contract_digest(mc)
         expected_hash = block.get("measurement_contract_hash")
         if not isinstance(mc, dict) or not mc:
-            errors.append(f"
+            errors.append(f"report missing {guard_key}.measurement_contract.")
         elif isinstance(expected_hash, str) and expected_hash:
             if mc_hash and mc_hash != expected_hash:
                 errors.append(
                     f"{guard_key}.measurement_contract_hash mismatch: expected={expected_hash}, computed={mc_hash}."
                 )
         else:
-            errors.append(f"
+            errors.append(f"report missing {guard_key}.measurement_contract_hash.")

         rp_guard = (
             resolved_policy.get(guard_key)
@@ -309,7 +371,7 @@ def _validate_measurement_contracts(
         rp_hash = _measurement_contract_digest(rp_mc)
         if not isinstance(rp_mc, dict) or not rp_mc:
             errors.append(
-                f"
+                f"report missing resolved_policy.{guard_key}.measurement_contract."
             )
         elif mc_hash and rp_hash and mc_hash != rp_hash:
             errors.append(
@@ -327,10 +389,10 @@ def _validate_measurement_contracts(
     return errors


-def _apply_profile_lints(
-    """Apply model-profile specific lint rules embedded in the
+def _apply_profile_lints(report: dict[str, Any]) -> list[str]:
+    """Apply model-profile specific lint rules embedded in the report."""
     errors: list[str] = []
-    meta =
+    meta = report.get("meta", {})
     profile = meta.get("model_profile") if isinstance(meta, dict) else None
     if not isinstance(profile, dict):
         return errors
@@ -346,7 +408,7 @@ def _apply_profile_lints(certificate: dict[str, Any]) -> list[str]:
         path = lint.get("path")
         expected = lint.get("value")
         message = lint.get("message") or "Model profile lint failed."
-        actual = _resolve_path(
+        actual = _resolve_path(report, path) if isinstance(path, str) else None

         if lint_type == "equals":
             if actual != expected:
@@ -383,21 +445,21 @@ def _apply_profile_lints(certificate: dict[str, Any]) -> list[str]:
     return errors


-def
+def _validate_evaluation_report_payload(
     path: Path, *, profile: str | None = None
 ) -> list[str]:
-    """Run all verification checks for a single
+    """Run all verification checks for a single evaluation report."""
     errors: list[str] = []
-
+    report = _load_evaluation_report(path)

     # Always surface schema validation failures for this payload
-    if not
-        errors.append("
+    if not validate_report(report):
+        errors.append("report schema validation failed.")
         return errors

-    errors.extend(_validate_primary_metric(
-    errors.extend(_validate_pairing(
-    errors.extend(_validate_counts(
+    errors.extend(_validate_primary_metric(report))
+    errors.extend(_validate_pairing(report))
+    errors.extend(_validate_counts(report))
     try:
         prof = (
             (profile or "").strip().lower()
@@ -406,24 +468,25 @@ def _validate_certificate_payload(
         )
     except Exception:
         prof = "dev"
-    #
+    # Drift band is a CI/Release enforcement check; dev profile should not
+    # fail verification due to preview→final drift.
     if prof in {"ci", "release"}:
-        errors.extend(_validate_drift_band(
-    errors.extend(_apply_profile_lints(
-    errors.extend(_validate_tokenizer_hash(
+        errors.extend(_validate_drift_band(report))
+    errors.extend(_apply_profile_lints(report))
+    errors.extend(_validate_tokenizer_hash(report))
     if prof in {"ci", "release"}:
-        errors.extend(_validate_measurement_contracts(
+        errors.extend(_validate_measurement_contracts(report, profile=prof))

     # strict/fast assurance mode checks were removed; verification gates rely on
     # structural schema + guard metric contracts instead.

     # Release-only enforcement: guard overhead must be measured or explicitly skipped.
     if prof == "release":
-        go =
+        go = report.get("guard_overhead")
         if not isinstance(go, dict) or not go:
             errors.append(
                 "Release verification requires guard_overhead (missing). "
-                "Set INVARLOCK_SKIP_OVERHEAD_CHECK=1 to explicitly skip during
+                "Set INVARLOCK_SKIP_OVERHEAD_CHECK=1 to explicitly skip during evaluation."
             )
         else:
             skipped = bool(go.get("skipped", False)) or (
@@ -434,7 +497,7 @@ def _validate_certificate_payload(
             if evaluated is not True:
                 errors.append(
                     "Release verification requires evaluated guard_overhead (not evaluated). "
-                    "Set INVARLOCK_SKIP_OVERHEAD_CHECK=1 to explicitly skip during
+                    "Set INVARLOCK_SKIP_OVERHEAD_CHECK=1 to explicitly skip during evaluation."
                 )
             ratio = go.get("overhead_ratio")
             if ratio is None:
@@ -446,14 +509,14 @@ def _validate_certificate_payload(
     return errors


-def _warn_adapter_family_mismatch(cert_path: Path,
+def _warn_adapter_family_mismatch(cert_path: Path, report: dict[str, Any]) -> None:
     """Emit a soft warning if adapter families differ between baseline and edited.

     This is a non-fatal hint to catch inadvertent cross-family comparisons.
-    Tries to load the baseline report referenced in the
+    Tries to load the baseline report referenced in the report provenance.
     """
     try:
-        plugins =
+        plugins = report.get("plugins") or {}
         adapter_meta = plugins.get("adapter") if isinstance(plugins, dict) else None
         edited_family = None
         edited_lib = None
@@ -466,8 +529,8 @@ def _warn_adapter_family_mismatch(cert_path: Path, certificate: dict[str, Any])
         edited_ver = prov.get("version") or None

         baseline_prov = (
-
-            if isinstance(
+            report.get("provenance")
+            if isinstance(report.get("provenance"), dict)
             else {}
         )
         baseline_report_path = None
@@ -517,7 +580,7 @@ def _warn_adapter_family_mismatch(cert_path: Path, certificate: dict[str, Any])
             f"[yellow] • edited : family={edited_family}, backend={edited_backend} {edited_version}[/yellow]"
         )
         console.print(
-            "[yellow] Ensure this cross-family comparison is intentional (Compare &
+            "[yellow] Ensure this cross-family comparison is intentional (Compare & Evaluate flows should normally match families).[/yellow]"
         )
     except Exception:
         # Non-fatal and best-effort; suppress errors
@@ -525,18 +588,18 @@ def _warn_adapter_family_mismatch(cert_path: Path, certificate: dict[str, Any])


 def verify_command(
-
+    reports: list[Path] = typer.Argument(
         ...,
         exists=True,
         dir_okay=False,
         readable=True,
         resolve_path=True,
-        help="One or more
+        help="One or more evaluation report JSON files to verify.",
     ),
     baseline: Path | None = typer.Option(
         None,
         "--baseline",
-        help="Optional baseline
+        help="Optional baseline evaluation report (or run report) JSON to enforce provider parity.",
     ),
     tolerance: float = typer.Option(
         1e-9,
@@ -555,9 +618,9 @@ def verify_command(
     ),
 ) -> None:
     """
-    Verify
+    Verify evaluation report integrity.

-    Ensures each
+    Ensures each evaluation report passes schema validation, ratio consistency checks,
     and strict pairing requirements (match=1.0, overlap=0.0).
     """

@@ -573,7 +636,7 @@ def verify_command(
     try:
         if baseline is not None:
             bdata = json.loads(baseline.read_text(encoding="utf-8"))
-            # Accept either
+            # Accept either an evaluation report or a run report (report.json); look under provenance when present.
             prov = bdata.get("provenance") if isinstance(bdata, dict) else None
             if isinstance(prov, dict):
                 pd = prov.get("provider_digest")
@@ -585,8 +648,8 @@ def verify_command(

     malformed_any = False
     try:
-        for cert_path in
-            cert_obj =
+        for cert_path in reports:
+            cert_obj = _load_evaluation_report(cert_path)

             # Enforce provider digest presence in CI/Release profiles
             try:
@@ -614,24 +677,21 @@ def verify_command(
                 )

             # Structural checks
-            errors =
+            errors = _validate_evaluation_report_payload(cert_path, profile=profile)
             # JSON path: emit a typed ValidationError for schema failures to include error.code
             if json_out and any(
                 "schema validation failed" in str(e).lower() for e in errors
             ):
                 raise _ValidationError(
                     code="E601",
-                    message="
+                    message="REPORT-SCHEMA-INVALID: schema validation failed",
                     details={"path": str(cert_path)},
                 )
             # Determine malformed vs policy-fail for this cert
             is_malformed = any(
                 ("schema validation failed" in e.lower())
                 or ("missing primary_metric.ratio_vs_baseline" in e)
-                or (
-                    "Certificate is missing a finite primary_metric.ratio_vs_baseline"
-                    in e
-                )
+                or ("report is missing a finite primary_metric.ratio_vs_baseline" in e)
                 for e in errors
             )
             malformed_any = malformed_any or is_malformed
@@ -748,7 +808,7 @@ def verify_command(
                 )
                 raise _MetricsError(
                     code="E602",
-                    message="RECOMPUTE-MISMATCH:
+                    message="RECOMPUTE-MISMATCH: report values disagree with recomputation",
                     details={"example": str(first)},
                 )

@@ -770,11 +830,11 @@ def verify_command(
     if not overall_ok:
         code = 2 if malformed_any else 1
         if json_out:
-            # Build per-
+            # Build per-report results payload
             results: list[dict[str, Any]] = []
-            for cert_path in
+            for cert_path in reports:
                 try:
-                    cert_obj =
+                    cert_obj = _load_evaluation_report(cert_path)
                 except Exception:
                     cert_obj = {}
                 pm = (
@@ -915,7 +975,7 @@ def verify_command(
                     "ok": False,
                     "reason": "malformed" if malformed_any else "policy_fail",
                 },
-                "
+                "evaluation_report": {"count": len(reports)},
                 "results": results,
                 "resolution": {"exit_code": code},
             }
@@ -924,11 +984,11 @@ def verify_command(

     # Success emission
     if json_out:
-        # Build per-
+        # Build per-report success results payload
         results: list[dict[str, Any]] = []
-        for cert_path in
+        for cert_path in reports:
             try:
-                cert_obj =
+                cert_obj = _load_evaluation_report(cert_path)
             except Exception:
                 cert_obj = {}
             pm = (
@@ -1057,7 +1117,7 @@ def verify_command(
         payload = {
             "format_version": FORMAT_VERIFY,
             "summary": {"ok": True, "reason": "ok"},
-            "
+            "evaluation_report": {"count": len(reports)},
             "results": results,
             "resolution": {"exit_code": 0},
         }
@@ -1065,7 +1125,7 @@ def verify_command(
     else:
         # Human-friendly success line
        try:
-            last =
+            last = _load_evaluation_report(reports[-1]) if reports else {}
             pm = last.get("primary_metric", {}) if isinstance(last, dict) else {}
             kind = str(pm.get("kind") or "").strip()
             ppl = last.get("ppl", {}) if isinstance(last, dict) else {}
@@ -1116,7 +1176,7 @@ def verify_command(
             "summary": {"ok": False, "reason": reason},
             "results": [
                 {
-                    "id": str(
+                    "id": str(reports[0]) if reports else "",
                     "schema_version": "v1",
                     "kind": "",
                     "ok": False,
@@ -1148,7 +1208,7 @@ def verify_command(
             "summary": {"ok": False, "reason": reason},
             "results": [
                 {
-                    "id": str(
+                    "id": str(reports[0]) if reports else "",
                     "schema_version": "v1",
                     "kind": "",
                     "ok": False,
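Taken together, the reworked verify command recomputes the primary-metric ratio from the baseline reference, enforces the paired-window guarantees, and (in CI/Release profiles) bounds preview-to-final drift. A minimal sketch of those recomputations against a hand-built report fragment; the numbers, the exact "kind" string, and the placement of paired_windows under stats are illustrative assumptions rather than the package's documented schema:

import math

# Hypothetical evaluation-report fragment; field names follow the accessors
# visible in the diff above (primary_metric.*, baseline_ref.primary_metric.final,
# dataset.windows.stats.*). Values are made up for illustration.
report = {
    "primary_metric": {
        "kind": "ppl",
        "preview": 12.30,
        "final": 12.36,
        "ratio_vs_baseline": 1.03,
    },
    "baseline_ref": {"primary_metric": {"final": 12.00}},
    "dataset": {
        "windows": {
            "stats": {
                "window_match_fraction": 1.0,
                "window_overlap_fraction": 0.0,
                "paired_windows": 256,  # placement under stats is an assumption
            }
        }
    },
}

pm = report["primary_metric"]
stats = report["dataset"]["windows"]["stats"]

# Ratio consistency: recorded ratio must match final / baseline_final to ~1e-6.
expected = pm["final"] / report["baseline_ref"]["primary_metric"]["final"]
assert math.isclose(pm["ratio_vs_baseline"], expected, rel_tol=1e-6, abs_tol=1e-6)

# Pairing guarantees: match == 1.0, overlap == 0.0, at least one paired window.
assert stats["window_match_fraction"] >= 0.999999
assert stats["window_overlap_fraction"] <= 1e-9
assert stats["paired_windows"] > 0

# Drift band (CI/Release profiles): preview-to-final ratio inside 0.95-1.05
# unless the report overrides primary_metric.drift_band.
drift = pm["final"] / pm["preview"]
assert 0.95 <= drift <= 1.05
print("fragment passes the recomputed checks")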
invarlock/cli/config.py
CHANGED

@@ -415,7 +415,7 @@ def _deep_merge_dicts(a: dict, b: dict) -> dict: # pragma: no cover - trivial a

 def create_example_config() -> InvarLockConfig: # pragma: no cover - test helper
     return InvarLockConfig(
-        model={"id": "gpt2", "adapter": "
+        model={"id": "gpt2", "adapter": "hf_causal", "device": "auto"},
         edit={"name": "quant_rtn", "plan": {}},
         dataset={"provider": "wikitext2", "seq_len": 512, "stride": 512},
         output={"dir": "runs"},
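The example config now points the model at the consolidated hf_causal adapter (the per-family hf_gpt2.py and hf_llama.py adapter modules are removed in this release, per the file list above). A rough sketch of an equivalent user-side config dict, mirroring only the keys shown in the diff; any additional fields a real config requires are not shown:

# Hypothetical config dict mirroring create_example_config() from the diff.
# Only the keys visible there are used; real configs may need more fields.
example_config = {
    "model": {"id": "gpt2", "adapter": "hf_causal", "device": "auto"},
    "edit": {"name": "quant_rtn", "plan": {}},
    "dataset": {"provider": "wikitext2", "seq_len": 512, "stride": 512},
    "output": {"dir": "runs"},
}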
invarlock/cli/determinism.py
CHANGED

@@ -5,7 +5,7 @@ Centralizes:
 - Thread caps (OMP/MKL/etc + torch threads)
 - TF32 policy
 - torch deterministic algorithms
-- A structured "determinism level" for
+- A structured "determinism level" for evaluation report provenance
 """

 from __future__ import annotations