invarlock-0.3.2-py3-none-any.whl → invarlock-0.3.4-py3-none-any.whl

@@ -117,6 +117,18 @@ def _validate_pairing(certificate: dict[str, Any]) -> list[str]:
 
     match_fraction = stats.get("window_match_fraction")
     overlap_fraction = stats.get("window_overlap_fraction")
+    pairing_reason = stats.get("window_pairing_reason")
+    paired_windows = _coerce_int(stats.get("paired_windows"))
+
+    if pairing_reason is not None:
+        errors.append(
+            "window_pairing_reason must be null/None for paired certificates "
+            f"(found {pairing_reason!r})."
+        )
+    if paired_windows is None:
+        errors.append("Certificate missing paired_windows metric.")
+    elif paired_windows == 0:
+        errors.append("paired_windows must be > 0 for paired certificates (found 0).")
 
     if match_fraction is None:
         errors.append("Certificate missing window_match_fraction metric.")
invarlock/cli/config.py CHANGED
@@ -207,11 +207,21 @@ def _create_loader(base_dir: Path):
     class Loader(yaml.SafeLoader):
         pass
 
-    Loader._base_dir = Path(base_dir)
+    Loader._base_dir = Path(base_dir).resolve()
 
     def _construct_include(loader: yaml.SafeLoader, node: yaml.Node):
         rel = loader.construct_scalar(node)
         path = (loader._base_dir / rel).resolve()
+        allow_outside = os.environ.get("INVARLOCK_ALLOW_CONFIG_INCLUDE_OUTSIDE", "")
+        allow_outside = allow_outside.strip().lower() in {"1", "true", "yes", "on"}
+        if not allow_outside:
+            try:
+                path.relative_to(loader._base_dir)
+            except ValueError as exc:
+                raise ValueError(
+                    "Config !include must stay within the config directory. "
+                    "Set INVARLOCK_ALLOW_CONFIG_INCLUDE_OUTSIDE=1 to override."
+                ) from exc
         with path.open(encoding="utf-8") as fh:
             inc_loader = _create_loader(path.parent)
             return yaml.load(fh, Loader=inc_loader)
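Note: the containment rule can be seen with pathlib alone; a minimal sketch of the same logic (paths invented for illustration):

    from pathlib import Path

    base = Path("/etc/app/conf").resolve()
    inside = (base / "extra/model.yaml").resolve()
    outside = (base / "../secrets.yaml").resolve()

    inside.relative_to(base)        # fine: resolved include stays under the config dir
    try:
        outside.relative_to(base)   # raises ValueError, so the include is rejected
    except ValueError:
        print("blocked unless INVARLOCK_ALLOW_CONFIG_INCLUDE_OUTSIDE is truthy")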
@@ -83,9 +83,24 @@ def apply_determinism_preset(
 
     # CUDA determinism: cuBLAS workspace config.
     if requested == "strict" and dev.startswith("cuda"):
-        os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":16:8")
+        preferred = ":4096:8"
+        fallback = ":16:8"
+        if "CUBLAS_WORKSPACE_CONFIG" not in os.environ:
+            selected = preferred
+            if torch is not None:
+                try:
+                    mem_bytes = int(torch.cuda.get_device_properties(0).total_memory)
+                    if mem_bytes and mem_bytes < 8 * 1024**3:
+                        selected = fallback
+                except Exception:
+                    selected = preferred
+            os.environ["CUBLAS_WORKSPACE_CONFIG"] = selected
         env_set["CUBLAS_WORKSPACE_CONFIG"] = os.environ.get("CUBLAS_WORKSPACE_CONFIG")
 
+    if requested == "strict":
+        os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+        env_set["TOKENIZERS_PARALLELISM"] = os.environ.get("TOKENIZERS_PARALLELISM")
+
     # Seed all RNGs (python/numpy/torch) using the existing helper for parity.
     set_seed(int(seed))
 
@@ -39,6 +39,31 @@ def _ensure_array(samples: Iterable[float]) -> np.ndarray:
     return arr
 
 
+def _normalize_weights(weights: Iterable[float] | None, n: int) -> np.ndarray | None:
+    if weights is None:
+        return None
+    arr = np.asarray(list(weights), dtype=float)
+    if arr.ndim != 1 or arr.size != n:
+        return None
+    if not np.all(np.isfinite(arr)):
+        return None
+    if np.any(arr < 0):
+        return None
+    total = float(arr.sum())
+    if total <= 0.0:
+        return None
+    if np.allclose(arr, arr[0]):
+        return None
+    return arr / total
+
+
+def _weighted_mean(samples: np.ndarray, weights: np.ndarray) -> float:
+    total = float(weights.sum())
+    if total <= 0.0:
+        return float(np.mean(samples))
+    return float(np.dot(samples, weights) / total)
+
+
 def _percentile_interval(stats: np.ndarray, alpha: float) -> tuple[float, float]:
     """Return lower/upper bounds from an array of bootstrap statistics."""
     lower_q = 100.0 * (alpha / 2.0)
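Note: the helpers' contract, exercised directly (values invented): uniform weights deliberately normalize to None so callers fall back to the plain unweighted mean:

    import numpy as np

    _normalize_weights([512, 512, 512], 3)   # None: equal weights, simple mean suffices
    _normalize_weights([512, 256, 128], 3)   # array([0.571..., 0.285..., 0.142...]), sums to 1.0
    _normalize_weights([512, -1, 128], 3)    # None: negative weights are rejected

    _weighted_mean(np.array([1.0, 2.0, 3.0]), np.array([0.5, 0.25, 0.25]))  # 1.75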
@@ -46,6 +71,61 @@ def _percentile_interval(stats: np.ndarray, alpha: float) -> tuple[float, float]:
     return float(np.percentile(stats, lower_q)), float(np.percentile(stats, upper_q))
 
 
+def _bca_interval_weighted(
+    samples: np.ndarray,
+    *,
+    weights: np.ndarray,
+    replicates: int,
+    alpha: float,
+    rng: np.random.Generator,
+) -> tuple[float, float]:
+    """Compute a BCa interval for the mean under weighted resampling."""
+    n = samples.size
+    if n < 2:
+        stat = _weighted_mean(samples, weights)
+        return float(stat), float(stat)
+
+    prob = weights / float(weights.sum())
+    stats = np.empty(replicates, dtype=float)
+    for i in range(replicates):
+        idx = rng.choice(n, size=n, replace=True, p=prob)
+        stats[i] = float(np.mean(samples[idx]))
+
+    stats.sort()
+    stat_hat = _weighted_mean(samples, weights)
+
+    prop = np.clip((stats < stat_hat).mean(), 1e-6, 1.0 - 1e-6)
+    z0 = Normal.inv_cdf(prop)
+
+    sum_w = float(weights.sum())
+    sum_wx = float(np.dot(samples, weights))
+    jack = np.empty(n, dtype=float)
+    for i in range(n):
+        w_i = float(weights[i])
+        denom = sum_w - w_i
+        if denom <= 0.0:
+            jack[i] = stat_hat
+        else:
+            jack[i] = (sum_wx - w_i * float(samples[i])) / denom
+
+    jack_mean = jack.mean()
+    numerator = np.sum((jack_mean - jack) ** 3)
+    denominator = 6.0 * (np.sum((jack_mean - jack) ** 2) ** 1.5)
+    if denominator == 0.0:
+        return _percentile_interval(stats, alpha)
+
+    acc = numerator / denominator
+
+    def _adjust_quantile(z_alpha: float) -> float:
+        adj = z0 + (z0 + z_alpha) / max(1.0 - acc * (z0 + z_alpha), 1e-12)
+        return float(Normal.cdf(adj))
+
+    lower_pct = _adjust_quantile(Normal.inv_cdf(alpha / 2.0))
+    upper_pct = _adjust_quantile(Normal.inv_cdf(1.0 - alpha / 2.0))
+
+    return float(np.quantile(stats, lower_pct)), float(np.quantile(stats, upper_pct))
+
+
 def _bca_interval(
     samples: np.ndarray,
     *,
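Note: the quantile adjustment above is the standard BCa construction (Efron 1987). With bias correction z0 = Phi^-1(Pr[theta* < theta_hat]) and acceleration a estimated from the jackknife skewness, the adjusted percentile for a nominal z_alpha is

    alpha_adj = Phi( z0 + (z0 + z_alpha) / (1 - a * (z0 + z_alpha)) )

which collapses to the plain percentile interval when z0 = 0 and a = 0; that is why the code falls back to _percentile_interval when the jackknife denominator vanishes, and clamps the adjustment denominator at 1e-12 to avoid division blow-ups.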
@@ -104,6 +184,42 @@ def _bca_interval(
     return float(np.quantile(stats, lower_pct)), float(np.quantile(stats, upper_pct))
 
 
+def _bootstrap_mean_ci_weighted(
+    samples: np.ndarray,
+    weights: np.ndarray,
+    *,
+    method: str,
+    replicates: int,
+    alpha: float,
+    seed: int,
+) -> tuple[float, float]:
+    if replicates <= 0:
+        raise ValueError("replicates must be positive")
+    if not 0.0 < alpha < 1.0:
+        raise ValueError("alpha must be between 0 and 1")
+
+    rng = np.random.default_rng(seed)
+    if method == "percentile":
+        stats = np.empty(replicates, dtype=float)
+        n = samples.size
+        prob = weights / float(weights.sum())
+        for i in range(replicates):
+            idx = rng.choice(n, size=n, replace=True, p=prob)
+            stats[i] = float(np.mean(samples[idx]))
+        stats.sort()
+        return _percentile_interval(stats, alpha)
+    if method == "bca":
+        return _bca_interval_weighted(
+            samples,
+            weights=weights,
+            replicates=replicates,
+            alpha=alpha,
+            rng=rng,
+        )
+
+    raise ValueError(f"Unsupported bootstrap method '{method}'")
+
+
 def _bootstrap_interval(
     samples: np.ndarray,
     *,
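Note: a usage sketch of the new helper (inputs invented); raw token counts work as weights because both method paths normalize them to a probability vector internally:

    import numpy as np

    deltas = np.array([0.012, -0.004, 0.009, 0.015])   # per-window delta log-loss
    tokens = np.array([512.0, 384.0, 512.0, 256.0])    # tokens per window
    lo, hi = _bootstrap_mean_ci_weighted(
        deltas, tokens, method="bca", replicates=1000, alpha=0.05, seed=0
    )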
@@ -171,6 +287,7 @@ def compute_logloss_ci(
 def compute_paired_delta_log_ci(
     final_logloss: Iterable[float],
     baseline_logloss: Iterable[float],
+    weights: Iterable[float] | None = None,
     *,
     method: str = "bca",
     replicates: int = 1000,
@@ -180,15 +297,14 @@
     """
     Compute a confidence interval over the paired mean delta of log-loss.
 
-    This implementation uses simple mean, which equals the token-weighted mean
-    when all evaluation windows have equal token counts. The runner enforces
-    `seq_len == stride` (non-overlapping windows) and `window_match_fraction == 1.0`
-    (perfect pairing), so the equal-weight simplification applies. See
-    docs/assurance/01-eval-math-proof.md for the full derivation.
+    This implementation uses token-weighted resampling when window weights are
+    provided. When all weights are equal, the weighted bootstrap reduces to the
+    simple mean. See docs/assurance/01-eval-math-proof.md for the derivation.
 
     Args:
         final_logloss: Iterable of per-window log-loss values after the edit/guard.
         baseline_logloss: Iterable of paired per-window log-loss values (before edit).
+        weights: Optional token counts per window; used for weighted resampling.
 
     Returns:
         (lo, hi) bounds of Δlog-loss such that ratio CI = exp(bounds).
@@ -199,6 +315,12 @@
     size = min(final_arr.size, base_arr.size)
     final_arr = final_arr[:size]
     base_arr = base_arr[:size]
+    weight_arr = None
+    if weights is not None:
+        weight_list = list(weights)
+        if len(weight_list) >= final_arr.size:
+            weight_list = weight_list[: final_arr.size]
+        weight_arr = _normalize_weights(weight_list, final_arr.size)
     if final_arr.size == 0:
         return 0.0, 0.0
 
@@ -207,6 +329,16 @@
         mean_delta = float(delta.mean())
         return mean_delta, mean_delta
 
+    if weight_arr is not None:
+        return _bootstrap_mean_ci_weighted(
+            delta,
+            weight_arr,
+            method=method,
+            replicates=replicates,
+            alpha=alpha,
+            seed=seed,
+        )
+
     def stat_fn(data: np.ndarray) -> float:
         return float(np.mean(data))
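Note: end to end, callers can now pass per-window token counts; a hedged usage sketch (numbers invented):

    final = [2.31, 2.28, 2.35, 2.30]   # per-window log-loss after the edit
    base = [2.30, 2.29, 2.33, 2.31]    # paired baseline log-loss
    tokens = [512, 512, 384, 256]      # token counts -> resampling weights
    lo, hi = compute_paired_delta_log_ci(
        final, base, tokens, method="bca", replicates=1000, alpha=0.05, seed=0
    )
    # Ratio CI on the perplexity scale is (exp(lo), exp(hi)).
    # With equal token counts, _normalize_weights returns None and the
    # original unweighted bootstrap path is used unchanged.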