diff-diff 2.0.4__cp312-cp312-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
diff_diff/linalg.py ADDED
@@ -0,0 +1,980 @@
+ """
+ Unified linear algebra backend for diff-diff.
+
+ This module provides optimized OLS and variance estimation with an optional
+ Rust backend for maximum performance.
+
+ The key optimizations are:
+ 1. scipy.linalg.lstsq with the 'gelsy' driver (QR-based, faster than SVD)
+ 2. Vectorized cluster-robust SEs via groupby (eliminates the O(n * G) loop over clusters)
+ 3. A single interface for all estimators (reduces code duplication)
+ 4. An optional Rust backend for additional speedup (when available)
+
+ The Rust backend is used automatically when available, with transparent
+ fallback to the NumPy/SciPy implementations.
+ """
+
+ import warnings
+ from dataclasses import dataclass
+ from typing import Dict, List, Optional, Tuple, Union
+
+ import numpy as np
+ import pandas as pd
+ from scipy import stats
+ from scipy.linalg import lstsq as scipy_lstsq
+
+ # Import the Rust backend if available (from _backend to avoid circular imports)
+ from diff_diff._backend import (
+     HAS_RUST_BACKEND,
+     _rust_compute_robust_vcov,
+     _rust_solve_ols,
+ )
+
+
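+ # A quick way to see which backend is active (HAS_RUST_BACKEND is the flag
+ # imported above; this sketch only reads it):
+ #
+ #     >>> from diff_diff.linalg import HAS_RUST_BACKEND
+ #     >>> "rust" if HAS_RUST_BACKEND else "numpy/scipy"
+
+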
+ def solve_ols(
+     X: np.ndarray,
+     y: np.ndarray,
+     *,
+     cluster_ids: Optional[np.ndarray] = None,
+     return_vcov: bool = True,
+     return_fitted: bool = False,
+     check_finite: bool = True,
+ ) -> Union[
+     Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]],
+     Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray]],
+ ]:
+     """
+     Solve OLS regression with optional clustered standard errors.
+
+     This is the unified OLS solver for all diff-diff estimators. It uses
+     scipy's optimized LAPACK routines and vectorized variance estimation.
+
+     Parameters
+     ----------
+     X : ndarray of shape (n, k)
+         Design matrix (should include an intercept column if desired).
+     y : ndarray of shape (n,)
+         Response vector.
+     cluster_ids : ndarray of shape (n,), optional
+         Cluster identifiers for cluster-robust standard errors.
+         If None, HC1 (heteroskedasticity-robust) SEs are computed.
+     return_vcov : bool, default True
+         Whether to compute and return the variance-covariance matrix.
+         Set to False for faster computation when SEs are not needed.
+     return_fitted : bool, default False
+         Whether to return fitted values in addition to residuals.
+     check_finite : bool, default True
+         Whether to check that X and y contain only finite values (no NaN/Inf).
+         Set to False for faster computation if you are certain your data is clean.
+
+     Returns
+     -------
+     coefficients : ndarray of shape (k,)
+         OLS coefficient estimates.
+     residuals : ndarray of shape (n,)
+         Residuals (y - X @ coefficients).
+     fitted : ndarray of shape (n,), optional
+         Fitted values (X @ coefficients). Only returned if return_fitted=True.
+     vcov : ndarray of shape (k, k) or None
+         Variance-covariance matrix (HC1 or cluster-robust).
+         None if return_vcov=False.
+
+     Notes
+     -----
+     This function uses scipy.linalg.lstsq with the 'gelsy' driver, which is
+     QR-based and typically faster than NumPy's default SVD-based solver for
+     well-conditioned matrices.
+
+     The cluster-robust standard errors use the sandwich estimator with the
+     standard small-sample adjustment: (G / (G - 1)) * ((n - 1) / (n - k)).
+
+     Examples
+     --------
+     >>> import numpy as np
+     >>> from diff_diff.linalg import solve_ols
+     >>> X = np.column_stack([np.ones(100), np.random.randn(100)])
+     >>> y = 2 + 3 * X[:, 1] + np.random.randn(100)
+     >>> coef, resid, vcov = solve_ols(X, y)
+     >>> print(f"Intercept: {coef[0]:.2f}, Slope: {coef[1]:.2f}")
+     """
+     # Validate inputs
+     X = np.asarray(X, dtype=np.float64)
+     y = np.asarray(y, dtype=np.float64)
+
+     if X.ndim != 2:
+         raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
+     if y.ndim != 1:
+         raise ValueError(f"y must be 1-dimensional, got shape {y.shape}")
+     if X.shape[0] != y.shape[0]:
+         raise ValueError(
+             f"X and y must have the same number of observations: "
+             f"{X.shape[0]} vs {y.shape[0]}"
+         )
+
+     n, k = X.shape
+     if n < k:
+         raise ValueError(
+             f"Fewer observations ({n}) than parameters ({k}). "
+             "Cannot solve underdetermined system."
+         )
+
+     # Check for NaN/Inf values if requested
+     if check_finite:
+         if not np.isfinite(X).all():
+             raise ValueError(
+                 "X contains NaN or Inf values. "
+                 "Clean your data or set check_finite=False to skip this check."
+             )
+         if not np.isfinite(y).all():
+             raise ValueError(
+                 "y contains NaN or Inf values. "
+                 "Clean your data or set check_finite=False to skip this check."
+             )
+
+     # Use the Rust backend if available. Fall back to NumPy when
+     # check_finite=False, since the Rust LAPACK bindings cannot handle
+     # non-finite values.
+     if HAS_RUST_BACKEND and check_finite:
+         # Ensure contiguous arrays for Rust
+         X = np.ascontiguousarray(X, dtype=np.float64)
+         y = np.ascontiguousarray(y, dtype=np.float64)
+
+         # Convert cluster_ids to int64 for Rust (if provided)
+         cluster_ids_int = None
+         if cluster_ids is not None:
+             cluster_ids_int = pd.factorize(cluster_ids)[0].astype(np.int64)
+
+         try:
+             coefficients, residuals, vcov = _rust_solve_ols(
+                 X, y, cluster_ids_int, return_vcov
+             )
+         except ValueError as e:
+             # Translate Rust LAPACK errors into consistent Python error messages
+             error_msg = str(e)
+             if "Matrix inversion failed" in error_msg or "Least squares failed" in error_msg:
+                 raise ValueError(
+                     "Design matrix is rank-deficient (singular X'X matrix). "
+                     "This indicates perfect multicollinearity. Check your fixed effects "
+                     "and covariates for linear dependencies."
+                 ) from e
+             raise
+
+         if return_fitted:
+             fitted = X @ coefficients
+             return coefficients, residuals, fitted, vcov
+         else:
+             return coefficients, residuals, vcov
+
+     # Fall back to the NumPy/SciPy implementation
+     return _solve_ols_numpy(
+         X, y, cluster_ids=cluster_ids, return_vcov=return_vcov, return_fitted=return_fitted
+     )
+
+
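+ # A minimal usage sketch for the clustered path (synthetic data; the call
+ # uses solve_ols exactly as defined above, and np is already imported):
+ #
+ #     >>> rng = np.random.default_rng(0)
+ #     >>> X = np.column_stack([np.ones(200), rng.normal(size=200)])
+ #     >>> cluster = np.repeat(np.arange(20), 10)      # 20 clusters of 10
+ #     >>> y = 1.0 + 0.5 * X[:, 1] + rng.normal(size=200)
+ #     >>> coef, resid, vcov = solve_ols(X, y, cluster_ids=cluster)
+ #     >>> se = np.sqrt(np.diag(vcov))                 # cluster-robust SEs
+
+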
+ def _solve_ols_numpy(
+     X: np.ndarray,
+     y: np.ndarray,
+     *,
+     cluster_ids: Optional[np.ndarray] = None,
+     return_vcov: bool = True,
+     return_fitted: bool = False,
+ ) -> Union[
+     Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]],
+     Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray]],
+ ]:
+     """
+     NumPy/SciPy fallback implementation of solve_ols.
+
+     Uses scipy.linalg.lstsq with the 'gelsy' driver (QR with column pivoting)
+     for numerically stable least-squares solving. QR decomposition is preferred
+     over the normal equations because it does not square the condition number
+     of X, making it more robust for the ill-conditioned matrices common in DiD
+     designs (e.g., many unit/time fixed effects).
+
+     Parameters
+     ----------
+     X : np.ndarray
+         Design matrix of shape (n, k).
+     y : np.ndarray
+         Response vector of shape (n,).
+     cluster_ids : np.ndarray, optional
+         Cluster identifiers for cluster-robust SEs.
+     return_vcov : bool
+         Whether to compute the variance-covariance matrix.
+     return_fitted : bool
+         Whether to return fitted values.
+
+     Returns
+     -------
+     coefficients : np.ndarray
+         OLS coefficients of shape (k,).
+     residuals : np.ndarray
+         Residuals of shape (n,).
+     fitted : np.ndarray, optional
+         Fitted values if return_fitted=True.
+     vcov : np.ndarray, optional
+         Variance-covariance matrix if return_vcov=True.
+     """
+     # Solve OLS using QR decomposition via scipy's optimized LAPACK routines.
+     # 'gelsy' uses QR with column pivoting, which is numerically stable even
+     # for ill-conditioned matrices (it doesn't square the condition number
+     # the way the normal equations would).
+     coefficients = scipy_lstsq(X, y, lapack_driver="gelsy", check_finite=False)[0]
+
+     # Compute residuals and fitted values
+     fitted = X @ coefficients
+     residuals = y - fitted
+
+     # Compute the variance-covariance matrix if requested
+     vcov = None
+     if return_vcov:
+         vcov = _compute_robust_vcov_numpy(X, residuals, cluster_ids)
+
+     if return_fitted:
+         return coefficients, residuals, fitted, vcov
+     else:
+         return coefficients, residuals, vcov
+
+
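+ # A sketch of the conditioning point above (illustrative only; X and y are
+ # placeholders): forming X'X squares the condition number, so with
+ # cond(X) ~ 1e8 the normal equations are numerically singular in float64
+ # while the QR route still solves the system.
+ #
+ #     >>> beta_qr = scipy_lstsq(X, y, lapack_driver="gelsy")[0]  # works at cond(X)
+ #     >>> beta_ne = np.linalg.solve(X.T @ X, X.T @ y)            # fails near cond(X)**2
+
+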
+ def compute_robust_vcov(
+     X: np.ndarray,
+     residuals: np.ndarray,
+     cluster_ids: Optional[np.ndarray] = None,
+ ) -> np.ndarray:
+     """
+     Compute a heteroskedasticity-robust or cluster-robust variance-covariance matrix.
+
+     Uses the sandwich estimator: (X'X)^{-1} * meat * (X'X)^{-1}.
+
+     Parameters
+     ----------
+     X : ndarray of shape (n, k)
+         Design matrix.
+     residuals : ndarray of shape (n,)
+         OLS residuals.
+     cluster_ids : ndarray of shape (n,), optional
+         Cluster identifiers. If None, computes HC1 robust SEs.
+
+     Returns
+     -------
+     vcov : ndarray of shape (k, k)
+         Variance-covariance matrix.
+
+     Notes
+     -----
+     For HC1 (no clustering):
+         meat = X' * diag(u^2) * X
+         adjustment = n / (n - k)
+
+     For cluster-robust:
+         meat = sum_g (X_g' u_g)(X_g' u_g)'
+         adjustment = (G / (G - 1)) * ((n - 1) / (n - k))
+
+     The cluster-robust computation is vectorized using pandas groupby,
+     which is much faster than a Python loop over clusters.
+     """
+     # Use the Rust backend if available
+     if HAS_RUST_BACKEND:
+         X = np.ascontiguousarray(X, dtype=np.float64)
+         residuals = np.ascontiguousarray(residuals, dtype=np.float64)
+
+         cluster_ids_int = None
+         if cluster_ids is not None:
+             cluster_ids_int = pd.factorize(cluster_ids)[0].astype(np.int64)
+
+         try:
+             return _rust_compute_robust_vcov(X, residuals, cluster_ids_int)
+         except ValueError as e:
+             # Translate Rust LAPACK errors into consistent Python error messages
+             error_msg = str(e)
+             if "Matrix inversion failed" in error_msg:
+                 raise ValueError(
+                     "Design matrix is rank-deficient (singular X'X matrix). "
+                     "This indicates perfect multicollinearity. Check your fixed effects "
+                     "and covariates for linear dependencies."
+                 ) from e
+             raise
+
+     # Fall back to the NumPy implementation
+     return _compute_robust_vcov_numpy(X, residuals, cluster_ids)
+
+
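+ # A minimal sketch of the two modes (X, residuals, and ids are placeholders
+ # for a design matrix, OLS residuals, and cluster labels):
+ #
+ #     >>> vcov_hc1 = compute_robust_vcov(X, residuals)                  # HC1
+ #     >>> vcov_cl = compute_robust_vcov(X, residuals, cluster_ids=ids)  # clustered
+
+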
+ def _compute_robust_vcov_numpy(
+     X: np.ndarray,
+     residuals: np.ndarray,
+     cluster_ids: Optional[np.ndarray] = None,
+ ) -> np.ndarray:
+     """
+     NumPy fallback implementation of compute_robust_vcov.
+
+     Computes the HC1 (heteroskedasticity-robust) or cluster-robust
+     variance-covariance matrix using the sandwich estimator.
+
+     Parameters
+     ----------
+     X : np.ndarray
+         Design matrix of shape (n, k).
+     residuals : np.ndarray
+         OLS residuals of shape (n,).
+     cluster_ids : np.ndarray, optional
+         Cluster identifiers. If None, uses HC1. If provided, uses
+         cluster-robust with the G/(G-1) small-sample adjustment.
+
+     Returns
+     -------
+     vcov : np.ndarray
+         Variance-covariance matrix of shape (k, k).
+
+     Notes
+     -----
+     Uses vectorized groupby aggregation for cluster-robust SEs to avoid
+     the O(n * G) loop that would be required with explicit iteration.
+     """
+     n, k = X.shape
+     XtX = X.T @ X
+
+     if cluster_ids is None:
+         # HC1 (heteroskedasticity-robust) standard errors
+         adjustment = n / (n - k)
+         u_squared = residuals**2
+         # Vectorized meat computation: X' diag(u^2) X = (X * u^2)' X
+         meat = X.T @ (X * u_squared[:, np.newaxis])
+     else:
+         # Cluster-robust standard errors (vectorized via groupby)
+         cluster_ids = np.asarray(cluster_ids)
+         unique_clusters = np.unique(cluster_ids)
+         n_clusters = len(unique_clusters)
+
+         if n_clusters < 2:
+             raise ValueError(
+                 f"Need at least 2 clusters for cluster-robust SEs, got {n_clusters}"
+             )
+
+         # Small-sample adjustment
+         adjustment = (n_clusters / (n_clusters - 1)) * ((n - 1) / (n - k))
+
+         # Compute cluster-level scores: the sum of X_i * u_i within each cluster;
+         # scores[i] = X[i] * residuals[i] for each observation
+         scores = X * residuals[:, np.newaxis]  # (n, k)
+
+         # Sum scores within each cluster using pandas groupby (vectorized);
+         # this is much faster than looping over clusters
+         cluster_scores = pd.DataFrame(scores).groupby(cluster_ids).sum().values  # (G, k)
+
+         # The meat is the outer-product sum: sum_g (score_g)(score_g)',
+         # equivalent to cluster_scores.T @ cluster_scores
+         meat = cluster_scores.T @ cluster_scores  # (k, k)
+
+     # Sandwich estimator: (X'X)^{-1} meat (X'X)^{-1}.
+     # Solve (X'X) temp = meat, then solve (X'X) vcov' = temp';
+     # more stable than an explicit inverse.
+     try:
+         temp = np.linalg.solve(XtX, meat)
+         vcov = adjustment * np.linalg.solve(XtX, temp.T).T
+     except np.linalg.LinAlgError as e:
+         if "Singular" in str(e):
+             raise ValueError(
+                 "Design matrix is rank-deficient (singular X'X matrix). "
+                 "This indicates perfect multicollinearity. Check your fixed effects "
+                 "and covariates for linear dependencies."
+             ) from e
+         raise
+
+     return vcov
+
+
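+ # For exposition, the groupby aggregation above is equivalent to this explicit
+ # per-cluster loop (a sketch reusing the variable names from the clustered
+ # branch of the function):
+ #
+ #     >>> meat_loop = np.zeros((k, k))
+ #     >>> for g in unique_clusters:
+ #     ...     s_g = scores[cluster_ids == g].sum(axis=0)   # X_g' u_g
+ #     ...     meat_loop += np.outer(s_g, s_g)              # (X_g' u_g)(X_g' u_g)'
+ #     >>> np.allclose(meat_loop, cluster_scores.T @ cluster_scores)
+ #     True
+
+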
+ def compute_r_squared(
+     y: np.ndarray,
+     residuals: np.ndarray,
+     adjusted: bool = False,
+     n_params: int = 0,
+ ) -> float:
+     """
+     Compute R-squared or adjusted R-squared.
+
+     Parameters
+     ----------
+     y : ndarray of shape (n,)
+         Response vector.
+     residuals : ndarray of shape (n,)
+         OLS residuals.
+     adjusted : bool, default False
+         If True, compute adjusted R-squared.
+     n_params : int, default 0
+         Number of parameters (including the intercept). Required if adjusted=True.
+
+     Returns
+     -------
+     r_squared : float
+         R-squared or adjusted R-squared.
+     """
+     ss_res = np.sum(residuals**2)
+     ss_tot = np.sum((y - np.mean(y)) ** 2)
+
+     if ss_tot == 0:
+         return 0.0
+
+     r_squared = 1 - (ss_res / ss_tot)
+
+     if adjusted:
+         n = len(y)
+         if n <= n_params:
+             return r_squared
+         r_squared = 1 - (1 - r_squared) * (n - 1) / (n - n_params)
+
+     return r_squared
+
+
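+ # A small worked check: ss_res = 4 * 0.1**2 = 0.04 and ss_tot = 5.0, so
+ # R^2 = 1 - 0.04 / 5.0 = 0.992.
+ #
+ #     >>> r2 = compute_r_squared(np.array([1.0, 2.0, 3.0, 4.0]),
+ #     ...                        np.array([0.1, -0.1, 0.1, -0.1]))
+ #     >>> round(r2, 3)
+ #     0.992
+
+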
+ # =============================================================================
+ # LinearRegression Helper Class
+ # =============================================================================
+
+
+ @dataclass
+ class InferenceResult:
+     """
+     Container for inference results on a single coefficient.
+
+     This dataclass provides a unified way to access coefficient estimates
+     and their associated inference statistics.
+
+     Attributes
+     ----------
+     coefficient : float
+         The point estimate of the coefficient.
+     se : float
+         Standard error of the coefficient.
+     t_stat : float
+         T-statistic (coefficient / se).
+     p_value : float
+         Two-sided p-value for the t-statistic.
+     conf_int : tuple of (float, float)
+         Confidence interval (lower, upper).
+     df : int or None
+         Degrees of freedom used for inference. None if using the normal distribution.
+     alpha : float
+         Significance level used for the confidence interval.
+
+     Examples
+     --------
+     >>> result = InferenceResult(
+     ...     coefficient=2.5, se=0.5, t_stat=5.0, p_value=0.0005,
+     ...     conf_int=(1.51, 3.49), df=100, alpha=0.05
+     ... )
+     >>> result.is_significant()
+     True
+     >>> result.significance_stars()
+     '***'
+     """
+
+     coefficient: float
+     se: float
+     t_stat: float
+     p_value: float
+     conf_int: Tuple[float, float]
+     df: Optional[int] = None
+     alpha: float = 0.05
+
+     def is_significant(self, alpha: Optional[float] = None) -> bool:
+         """Check if the coefficient is statistically significant."""
+         threshold = alpha if alpha is not None else self.alpha
+         return self.p_value < threshold
+
+     def significance_stars(self) -> str:
+         """Return significance stars based on the p-value."""
+         if self.p_value < 0.001:
+             return "***"
+         elif self.p_value < 0.01:
+             return "**"
+         elif self.p_value < 0.05:
+             return "*"
+         elif self.p_value < 0.1:
+             return "."
+         return ""
+
+     def to_dict(self) -> Dict[str, Union[float, Tuple[float, float], int, None]]:
+         """Convert to a dictionary representation."""
+         return {
+             "coefficient": self.coefficient,
+             "se": self.se,
+             "t_stat": self.t_stat,
+             "p_value": self.p_value,
+             "conf_int": self.conf_int,
+             "df": self.df,
+             "alpha": self.alpha,
+         }
+
+
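+ # A sketch of collecting results into a tidy table (pd is imported above;
+ # `results` stands in for a list of InferenceResult objects, e.g. from
+ # LinearRegression.get_all_inference defined below):
+ #
+ #     >>> table = pd.DataFrame([r.to_dict() for r in results])
+ #     >>> table[["coefficient", "se", "p_value"]]
+
+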
+ class LinearRegression:
+     """
+     OLS regression helper with unified coefficient extraction and inference.
+
+     This class wraps the low-level `solve_ols` function and provides a clean
+     interface for fitting regressions and extracting coefficient-level inference.
+     It eliminates code duplication across estimators by centralizing the common
+     pattern: fit OLS -> extract coefficient -> compute SE -> compute t-stat
+     -> compute p-value -> compute CI.
+
+     Parameters
+     ----------
+     include_intercept : bool, default True
+         Whether to automatically add an intercept column to the design matrix.
+     robust : bool, default True
+         Whether to use heteroskedasticity-robust (HC1) standard errors.
+         If False and cluster_ids is None, uses classical OLS standard errors.
+     cluster_ids : array-like, optional
+         Cluster identifiers for cluster-robust standard errors.
+         Overrides the `robust` parameter if provided.
+     alpha : float, default 0.05
+         Significance level for confidence intervals.
+
+     Attributes
+     ----------
+     coefficients_ : ndarray
+         Fitted coefficient values (available after fit).
+     vcov_ : ndarray
+         Variance-covariance matrix (available after fit).
+     residuals_ : ndarray
+         Residuals from the fit (available after fit).
+     fitted_values_ : ndarray
+         Fitted values from the fit (available after fit).
+     n_obs_ : int
+         Number of observations (available after fit).
+     n_params_ : int
+         Number of parameters including the intercept (available after fit).
+     df_ : int
+         Degrees of freedom (n - k - df_adjustment) (available after fit).
+
+     Examples
+     --------
+     Basic usage with an automatic intercept:
+
+     >>> import numpy as np
+     >>> from diff_diff.linalg import LinearRegression
+     >>> X = np.random.randn(100, 2)
+     >>> y = 1 + 2 * X[:, 0] + 3 * X[:, 1] + np.random.randn(100)
+     >>> reg = LinearRegression().fit(X, y)
+     >>> print(f"Intercept: {reg.coefficients_[0]:.2f}")
+     >>> inference = reg.get_inference(1)  # inference for the first predictor
+     >>> print(f"Coef: {inference.coefficient:.2f}, SE: {inference.se:.2f}")
+
+     Using cluster-robust standard errors:
+
+     >>> cluster_ids = np.repeat(np.arange(20), 5)  # 20 clusters of 5
+     >>> reg = LinearRegression(cluster_ids=cluster_ids).fit(X, y)
+     >>> inference = reg.get_inference(1)
+     >>> print(f"Cluster-robust SE: {inference.se:.2f}")
+
+     Extracting multiple coefficients at once:
+
+     >>> results = reg.get_inference_batch([1, 2])
+     >>> for idx, inf in results.items():
+     ...     print(f"Coef {idx}: {inf.coefficient:.2f} ({inf.significance_stars()})")
+     """
+
+     def __init__(
+         self,
+         include_intercept: bool = True,
+         robust: bool = True,
+         cluster_ids: Optional[np.ndarray] = None,
+         alpha: float = 0.05,
+     ):
+         self.include_intercept = include_intercept
+         self.robust = robust
+         self.cluster_ids = cluster_ids
+         self.alpha = alpha
+
+         # Fitted attributes (set by fit())
+         self.coefficients_: Optional[np.ndarray] = None
+         self.vcov_: Optional[np.ndarray] = None
+         self.residuals_: Optional[np.ndarray] = None
+         self.fitted_values_: Optional[np.ndarray] = None
+         self._y: Optional[np.ndarray] = None
+         self._X: Optional[np.ndarray] = None
+         self.n_obs_: Optional[int] = None
+         self.n_params_: Optional[int] = None
+         self.df_: Optional[int] = None
+
+     def fit(
+         self,
+         X: np.ndarray,
+         y: np.ndarray,
+         *,
+         cluster_ids: Optional[np.ndarray] = None,
+         df_adjustment: int = 0,
+     ) -> "LinearRegression":
+         """
+         Fit OLS regression.
+
+         Parameters
+         ----------
+         X : ndarray of shape (n, k)
+             Design matrix. An intercept column is added if include_intercept=True.
+         y : ndarray of shape (n,)
+             Response vector.
+         cluster_ids : ndarray, optional
+             Cluster identifiers for this fit. Overrides the instance-level
+             cluster_ids if provided.
+         df_adjustment : int, default 0
+             Additional degrees-of-freedom adjustment (e.g., for absorbed fixed
+             effects). The effective df is n - k - df_adjustment.
+
+         Returns
+         -------
+         self : LinearRegression
+             Fitted estimator.
+         """
+         X = np.asarray(X, dtype=np.float64)
+         y = np.asarray(y, dtype=np.float64)
+
+         # Add an intercept if requested
+         if self.include_intercept:
+             X = np.column_stack([np.ones(X.shape[0]), X])
+
+         # Use the provided cluster_ids or fall back to the instance-level ones
+         effective_cluster_ids = cluster_ids if cluster_ids is not None else self.cluster_ids
+
+         if self.robust or effective_cluster_ids is not None:
+             # Use solve_ols with robust/cluster SEs
+             coefficients, residuals, fitted, vcov = solve_ols(
+                 X, y,
+                 cluster_ids=effective_cluster_ids,
+                 return_fitted=True,
+                 return_vcov=True,
+             )
+         else:
+             # Classical OLS - compute the vcov separately
+             coefficients, residuals, fitted, _ = solve_ols(
+                 X, y,
+                 return_fitted=True,
+                 return_vcov=False,
+             )
+             # Compute the classical OLS variance-covariance matrix
+             n, k = X.shape
+             mse = np.sum(residuals**2) / (n - k)
+             try:
+                 vcov = np.linalg.solve(X.T @ X, mse * np.eye(k))
+             except np.linalg.LinAlgError:
+                 # Fall back to the pseudo-inverse for singular matrices
+                 vcov = np.linalg.pinv(X.T @ X) * mse
+
+         # Store fitted attributes
+         self.coefficients_ = coefficients
+         self.vcov_ = vcov
+         self.residuals_ = residuals
+         self.fitted_values_ = fitted
+         self._y = y
+         self._X = X
+         self.n_obs_ = X.shape[0]
+         self.n_params_ = X.shape[1]
+         self.df_ = self.n_obs_ - self.n_params_ - df_adjustment
+
+         return self
+
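+     # A sketch of absorbing unit fixed effects through df_adjustment
+     # (illustrative; X_dm and y_dm are within-demeaned data and n_units is a
+     # hypothetical count of absorbed units):
+     #
+     #     >>> reg = LinearRegression(include_intercept=False)
+     #     >>> reg = reg.fit(X_dm, y_dm, df_adjustment=n_units - 1)
+     #     >>> reg.df_  # n - k - (n_units - 1)
+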
+     def _check_fitted(self) -> None:
+         """Raise an error if the model has not been fitted."""
+         if self.coefficients_ is None:
+             raise ValueError("Model has not been fitted. Call fit() first.")
+
+     def get_coefficient(self, index: int) -> float:
+         """
+         Get the coefficient value at a specific index.
+
+         Parameters
+         ----------
+         index : int
+             Index of the coefficient in the coefficient array.
+
+         Returns
+         -------
+         float
+             Coefficient value.
+         """
+         self._check_fitted()
+         return float(self.coefficients_[index])
+
+     def get_se(self, index: int) -> float:
+         """
+         Get the standard error for a coefficient.
+
+         Parameters
+         ----------
+         index : int
+             Index of the coefficient.
+
+         Returns
+         -------
+         float
+             Standard error.
+         """
+         self._check_fitted()
+         return float(np.sqrt(self.vcov_[index, index]))
+
+     def get_inference(
+         self,
+         index: int,
+         alpha: Optional[float] = None,
+         df: Optional[int] = None,
+     ) -> InferenceResult:
+         """
+         Get full inference results for a coefficient.
+
+         This is the primary method for extracting coefficient-level inference,
+         returning all statistics in a single call.
+
+         Parameters
+         ----------
+         index : int
+             Index of the coefficient in the coefficient array.
+         alpha : float, optional
+             Significance level for the CI. Defaults to the instance-level alpha.
+         df : int, optional
+             Degrees of freedom for the t-distribution. Defaults to the fitted
+             df (n - k - df_adjustment). If the effective df is unavailable or
+             non-positive, the normal distribution is used instead.
+
+         Returns
+         -------
+         InferenceResult
+             Dataclass containing coefficient, se, t_stat, p_value, conf_int.
+
+         Examples
+         --------
+         >>> reg = LinearRegression().fit(X, y)
+         >>> result = reg.get_inference(1)
+         >>> print(f"Effect: {result.coefficient:.3f} (SE: {result.se:.3f})")
+         >>> print(f"95% CI: [{result.conf_int[0]:.3f}, {result.conf_int[1]:.3f}]")
+         >>> if result.is_significant():
+         ...     print("Statistically significant!")
+         """
+         self._check_fitted()
+
+         coef = float(self.coefficients_[index])
+         se = float(np.sqrt(self.vcov_[index, index]))
+
+         # Handle a zero or negative SE (indicates a perfect fit or numerical issues)
+         if se <= 0:
+             warnings.warn(
+                 f"Standard error is zero or negative (se={se}) for coefficient at index {index}. "
+                 "This may indicate perfect multicollinearity or numerical issues.",
+                 UserWarning,
+             )
+             # Use +/- inf for the t-stat when the SE is zero (perfect-fit scenario)
+             if coef > 0:
+                 t_stat = np.inf
+             elif coef < 0:
+                 t_stat = -np.inf
+             else:
+                 t_stat = 0.0
+         else:
+             t_stat = coef / se
+
+         # Use the instance alpha if not provided
+         effective_alpha = alpha if alpha is not None else self.alpha
+
+         # Use the fitted df if not explicitly provided
+         # (an effective df of None means the normal distribution is used)
+         effective_df = df if df is not None else self.df_
+
+         # Warn if the df is non-positive and fall back to the normal distribution
+         if effective_df is not None and effective_df <= 0:
+             warnings.warn(
+                 f"Degrees of freedom is non-positive (df={effective_df}). "
+                 "Using the normal distribution instead of the t-distribution for inference.",
+                 UserWarning,
+             )
+             effective_df = None
+
+         # Compute the p-value
+         p_value = _compute_p_value(t_stat, df=effective_df)
+
+         # Compute the confidence interval
+         conf_int = _compute_confidence_interval(coef, se, effective_alpha, df=effective_df)
+
+         return InferenceResult(
+             coefficient=coef,
+             se=se,
+             t_stat=t_stat,
+             p_value=p_value,
+             conf_int=conf_int,
+             df=effective_df,
+             alpha=effective_alpha,
+         )
+
+     def get_inference_batch(
+         self,
+         indices: List[int],
+         alpha: Optional[float] = None,
+         df: Optional[int] = None,
+     ) -> Dict[int, InferenceResult]:
+         """
+         Get inference results for multiple coefficients.
+
+         Parameters
+         ----------
+         indices : list of int
+             Indices of the coefficients to extract.
+         alpha : float, optional
+             Significance level for the CIs. Defaults to the instance-level alpha.
+         df : int, optional
+             Degrees of freedom. Defaults to the fitted df.
+
+         Returns
+         -------
+         dict
+             Dictionary mapping index -> InferenceResult.
+
+         Examples
+         --------
+         >>> reg = LinearRegression().fit(X, y)
+         >>> results = reg.get_inference_batch([1, 2, 3])
+         >>> for idx, inf in results.items():
+         ...     print(f"Coef {idx}: {inf.coefficient:.3f} {inf.significance_stars()}")
+         """
+         self._check_fitted()
+         return {idx: self.get_inference(idx, alpha=alpha, df=df) for idx in indices}
+
+     def get_all_inference(
+         self,
+         alpha: Optional[float] = None,
+         df: Optional[int] = None,
+     ) -> List[InferenceResult]:
+         """
+         Get inference results for all coefficients.
+
+         Parameters
+         ----------
+         alpha : float, optional
+             Significance level for the CIs. Defaults to the instance-level alpha.
+         df : int, optional
+             Degrees of freedom. Defaults to the fitted df.
+
+         Returns
+         -------
+         list of InferenceResult
+             Inference results for each coefficient, in order.
+         """
+         self._check_fitted()
+         return [
+             self.get_inference(i, alpha=alpha, df=df)
+             for i in range(len(self.coefficients_))
+         ]
+
+     def r_squared(self, adjusted: bool = False) -> float:
+         """
+         Compute R-squared or adjusted R-squared.
+
+         Parameters
+         ----------
+         adjusted : bool, default False
+             If True, return adjusted R-squared.
+
+         Returns
+         -------
+         float
+             R-squared value.
+         """
+         self._check_fitted()
+         return compute_r_squared(
+             self._y, self.residuals_, adjusted=adjusted, n_params=self.n_params_
+         )
+
+     def predict(self, X: np.ndarray) -> np.ndarray:
+         """
+         Predict using the fitted model.
+
+         Parameters
+         ----------
+         X : ndarray of shape (n, k)
+             Design matrix for prediction. Should have the same number of columns
+             as the original X (excluding the intercept if include_intercept=True).
+
+         Returns
+         -------
+         ndarray
+             Predicted values.
+         """
+         self._check_fitted()
+         X = np.asarray(X, dtype=np.float64)
+
+         if self.include_intercept:
+             X = np.column_stack([np.ones(X.shape[0]), X])
+
+         return X @ self.coefficients_
+
+
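+ # An end-to-end sketch tying the class together (synthetic data; every name
+ # used here is defined in this module):
+ #
+ #     >>> rng = np.random.default_rng(1)
+ #     >>> X = rng.normal(size=(120, 2))
+ #     >>> y = 1.0 + 2.0 * X[:, 0] - 0.5 * X[:, 1] + rng.normal(size=120)
+ #     >>> reg = LinearRegression().fit(X, y)
+ #     >>> reg.r_squared()                      # in-sample fit
+ #     >>> reg.get_inference(1).conf_int        # 95% CI for the first predictor
+ #     >>> reg.predict(X[:5])                   # predictions on new rows
+
+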
+ # =============================================================================
+ # Internal helpers for inference (used by LinearRegression)
+ # =============================================================================
+
+
+ def _compute_p_value(
+     t_stat: float,
+     df: Optional[int] = None,
+     two_sided: bool = True,
+ ) -> float:
+     """
+     Compute the p-value for a t-statistic.
+
+     Parameters
+     ----------
+     t_stat : float
+         T-statistic.
+     df : int, optional
+         Degrees of freedom. If None, uses the normal distribution.
+     two_sided : bool, default True
+         Whether to compute a two-sided p-value.
+
+     Returns
+     -------
+     float
+         P-value.
+     """
+     if df is not None and df > 0:
+         p_value = stats.t.sf(np.abs(t_stat), df)
+     else:
+         p_value = stats.norm.sf(np.abs(t_stat))
+
+     if two_sided:
+         p_value *= 2
+
+     return float(p_value)
+
+
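+ # A quick numerical check of the survival functions used above: for
+ # t_stat=1.96 with df=None the two-sided p-value is 2 * norm.sf(1.96), about
+ # 0.05; with a small df it is larger, reflecting the t-distribution's
+ # heavier tails.
+ #
+ #     >>> round(_compute_p_value(1.96), 3)        # normal reference
+ #     0.05
+ #     >>> _compute_p_value(1.96, df=10) > 0.05    # heavier tails
+ #     True
+
+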
+ def _compute_confidence_interval(
+     estimate: float,
+     se: float,
+     alpha: float = 0.05,
+     df: Optional[int] = None,
+ ) -> Tuple[float, float]:
+     """
+     Compute a confidence interval for an estimate.
+
+     Parameters
+     ----------
+     estimate : float
+         Point estimate.
+     se : float
+         Standard error.
+     alpha : float, default 0.05
+         Significance level (0.05 for a 95% CI).
+     df : int, optional
+         Degrees of freedom. If None, uses the normal distribution.
+
+     Returns
+     -------
+     tuple of (float, float)
+         (lower_bound, upper_bound) of the confidence interval.
+     """
+     if df is not None and df > 0:
+         critical_value = stats.t.ppf(1 - alpha / 2, df)
+     else:
+         critical_value = stats.norm.ppf(1 - alpha / 2)
+
+     lower = estimate - critical_value * se
+     upper = estimate + critical_value * se
+
+     return (lower, upper)
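+
+
+ # A quick numerical check of the critical values used above: the two-sided
+ # 5% normal critical value is about 1.96, and the t value with df=100 is
+ # slightly larger, so the t-based interval is a bit wider.
+ #
+ #     >>> lo, hi = _compute_confidence_interval(2.5, 0.5, alpha=0.05)
+ #     >>> round(lo, 2), round(hi, 2)                      # 2.5 -/+ 1.96 * 0.5
+ #     (1.52, 3.48)
+ #     >>> _compute_confidence_interval(2.5, 0.5, df=100)  # slightly wider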