diff-diff 2.1.0__cp39-cp39-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- diff_diff/__init__.py +234 -0
- diff_diff/_backend.py +64 -0
- diff_diff/_rust_backend.cpython-39-darwin.so +0 -0
- diff_diff/bacon.py +979 -0
- diff_diff/datasets.py +708 -0
- diff_diff/diagnostics.py +927 -0
- diff_diff/estimators.py +1000 -0
- diff_diff/honest_did.py +1493 -0
- diff_diff/linalg.py +980 -0
- diff_diff/power.py +1350 -0
- diff_diff/prep.py +1338 -0
- diff_diff/pretrends.py +1067 -0
- diff_diff/results.py +703 -0
- diff_diff/staggered.py +2297 -0
- diff_diff/sun_abraham.py +1176 -0
- diff_diff/synthetic_did.py +738 -0
- diff_diff/triple_diff.py +1291 -0
- diff_diff/trop.py +1348 -0
- diff_diff/twfe.py +344 -0
- diff_diff/utils.py +1481 -0
- diff_diff/visualization.py +1627 -0
- diff_diff-2.1.0.dist-info/METADATA +2511 -0
- diff_diff-2.1.0.dist-info/RECORD +24 -0
- diff_diff-2.1.0.dist-info/WHEEL +4 -0
diff_diff/linalg.py
ADDED
@@ -0,0 +1,980 @@
"""
Unified linear algebra backend for diff-diff.

This module provides optimized OLS and variance estimation with an optional
Rust backend for maximum performance.

The key optimizations are:
1. scipy.linalg.lstsq with 'gelsy' driver (QR-based, faster than SVD)
2. Vectorized cluster-robust SE via groupby (eliminates O(n*clusters) loop)
3. Single interface for all estimators (reduces code duplication)
4. Optional Rust backend for additional speedup (when available)

The Rust backend is automatically used when available, with transparent
fallback to NumPy/SciPy implementations.
"""

from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from scipy import stats
from scipy.linalg import lstsq as scipy_lstsq

# Import Rust backend if available (from _backend to avoid circular imports)
from diff_diff._backend import (
    HAS_RUST_BACKEND,
    _rust_compute_robust_vcov,
    _rust_solve_ols,
)


def solve_ols(
    X: np.ndarray,
    y: np.ndarray,
    *,
    cluster_ids: Optional[np.ndarray] = None,
    return_vcov: bool = True,
    return_fitted: bool = False,
    check_finite: bool = True,
) -> Union[
    Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]],
    Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray]],
]:
    """
    Solve OLS regression with optional clustered standard errors.

    This is the unified OLS solver for all diff-diff estimators. It uses
    scipy's optimized LAPACK routines and vectorized variance estimation.

    Parameters
    ----------
    X : ndarray of shape (n, k)
        Design matrix (should include intercept if desired).
    y : ndarray of shape (n,)
        Response vector.
    cluster_ids : ndarray of shape (n,), optional
        Cluster identifiers for cluster-robust standard errors.
        If None, HC1 (heteroskedasticity-robust) SEs are computed.
    return_vcov : bool, default True
        Whether to compute and return the variance-covariance matrix.
        Set to False for faster computation when SEs are not needed.
    return_fitted : bool, default False
        Whether to return fitted values in addition to residuals.
    check_finite : bool, default True
        Whether to check that X and y contain only finite values (no NaN/Inf).
        Set to False for faster computation if you are certain your data is clean.

    Returns
    -------
    coefficients : ndarray of shape (k,)
        OLS coefficient estimates.
    residuals : ndarray of shape (n,)
        Residuals (y - X @ coefficients).
    fitted : ndarray of shape (n,), optional
        Fitted values (X @ coefficients). Only returned if return_fitted=True.
    vcov : ndarray of shape (k, k) or None
        Variance-covariance matrix (HC1 or cluster-robust).
        None if return_vcov=False.

    Notes
    -----
    This function uses scipy.linalg.lstsq with the 'gelsy' driver, which is
    QR-based and typically faster than NumPy's default SVD-based solver for
    well-conditioned matrices.

    The cluster-robust standard errors use the sandwich estimator with the
    standard small-sample adjustment: (G/(G-1)) * ((n-1)/(n-k)).

    Examples
    --------
    >>> import numpy as np
    >>> from diff_diff.linalg import solve_ols
    >>> X = np.column_stack([np.ones(100), np.random.randn(100)])
    >>> y = 2 + 3 * X[:, 1] + np.random.randn(100)
    >>> coef, resid, vcov = solve_ols(X, y)
    >>> print(f"Intercept: {coef[0]:.2f}, Slope: {coef[1]:.2f}")
    """
    # Validate inputs
    X = np.asarray(X, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)

    if X.ndim != 2:
        raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
    if y.ndim != 1:
        raise ValueError(f"y must be 1-dimensional, got shape {y.shape}")
    if X.shape[0] != y.shape[0]:
        raise ValueError(
            f"X and y must have same number of observations: "
            f"{X.shape[0]} vs {y.shape[0]}"
        )

    n, k = X.shape
    if n < k:
        raise ValueError(
            f"Fewer observations ({n}) than parameters ({k}). "
            "Cannot solve underdetermined system."
        )

    # Check for NaN/Inf values if requested
    if check_finite:
        if not np.isfinite(X).all():
            raise ValueError(
                "X contains NaN or Inf values. "
                "Clean your data or set check_finite=False to skip this check."
            )
        if not np.isfinite(y).all():
            raise ValueError(
                "y contains NaN or Inf values. "
                "Clean your data or set check_finite=False to skip this check."
            )

    # Use Rust backend if available
    # Note: Fall back to NumPy if check_finite=False since Rust's LAPACK
    # doesn't support non-finite values
    if HAS_RUST_BACKEND and check_finite:
        # Ensure contiguous arrays for Rust
        X = np.ascontiguousarray(X, dtype=np.float64)
        y = np.ascontiguousarray(y, dtype=np.float64)

        # Convert cluster_ids to int64 for Rust (if provided)
        cluster_ids_int = None
        if cluster_ids is not None:
            cluster_ids_int = pd.factorize(cluster_ids)[0].astype(np.int64)

        try:
            coefficients, residuals, vcov = _rust_solve_ols(
                X, y, cluster_ids_int, return_vcov
            )
        except ValueError as e:
            # Translate Rust LAPACK errors to consistent Python error messages
            error_msg = str(e)
            if "Matrix inversion failed" in error_msg or "Least squares failed" in error_msg:
                raise ValueError(
                    "Design matrix is rank-deficient (singular X'X matrix). "
                    "This indicates perfect multicollinearity. Check your fixed effects "
                    "and covariates for linear dependencies."
                ) from e
            raise

        if return_fitted:
            fitted = X @ coefficients
            return coefficients, residuals, fitted, vcov
        else:
            return coefficients, residuals, vcov

    # Fallback to NumPy/SciPy implementation
    return _solve_ols_numpy(
        X, y, cluster_ids=cluster_ids, return_vcov=return_vcov, return_fitted=return_fitted
    )
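

# A minimal illustrative sketch of the clustered path of solve_ols above. The
# simulated design, outcome, and cluster layout are assumptions made only for
# this example; nothing in the package calls this function.
def _sketch_clustered_solve_ols() -> None:
    rng = np.random.default_rng(0)
    n = 1000
    clusters = np.repeat(np.arange(40), n // 40)  # 40 equal-sized clusters
    X = np.column_stack([np.ones(n), rng.normal(size=n)])
    y = X @ np.array([1.0, 2.0]) + rng.normal(size=n)
    # With cluster_ids supplied, vcov is the cluster-robust sandwich estimate
    coef, _, vcov = solve_ols(X, y, cluster_ids=clusters)
    cluster_se = np.sqrt(np.diag(vcov))
    print(f"slope = {coef[1]:.3f}, cluster-robust SE = {cluster_se[1]:.3f}")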


def _solve_ols_numpy(
    X: np.ndarray,
    y: np.ndarray,
    *,
    cluster_ids: Optional[np.ndarray] = None,
    return_vcov: bool = True,
    return_fitted: bool = False,
) -> Union[
    Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]],
    Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray]],
]:
    """
    NumPy/SciPy fallback implementation of solve_ols.

    Uses scipy.linalg.lstsq with 'gelsy' driver (QR with column pivoting)
    for numerically stable least squares solving. QR decomposition is preferred
    over normal equations because it doesn't square the condition number of X,
    making it more robust for ill-conditioned matrices common in DiD designs
    (e.g., many unit/time fixed effects).

    Parameters
    ----------
    X : np.ndarray
        Design matrix of shape (n, k).
    y : np.ndarray
        Response vector of shape (n,).
    cluster_ids : np.ndarray, optional
        Cluster identifiers for cluster-robust SEs.
    return_vcov : bool
        Whether to compute variance-covariance matrix.
    return_fitted : bool
        Whether to return fitted values.

    Returns
    -------
    coefficients : np.ndarray
        OLS coefficients of shape (k,).
    residuals : np.ndarray
        Residuals of shape (n,).
    fitted : np.ndarray, optional
        Fitted values if return_fitted=True.
    vcov : np.ndarray, optional
        Variance-covariance matrix if return_vcov=True.
    """
    # Solve OLS using QR decomposition via scipy's optimized LAPACK routines
    # 'gelsy' uses QR with column pivoting, which is numerically stable even
    # for ill-conditioned matrices (doesn't square the condition number like
    # normal equations would)
    coefficients = scipy_lstsq(X, y, lapack_driver="gelsy", check_finite=False)[0]

    # Compute residuals and fitted values
    fitted = X @ coefficients
    residuals = y - fitted

    # Compute variance-covariance matrix if requested
    vcov = None
    if return_vcov:
        vcov = _compute_robust_vcov_numpy(X, residuals, cluster_ids)

    if return_fitted:
        return coefficients, residuals, fitted, vcov
    else:
        return coefficients, residuals, vcov


def compute_robust_vcov(
    X: np.ndarray,
    residuals: np.ndarray,
    cluster_ids: Optional[np.ndarray] = None,
) -> np.ndarray:
    """
    Compute heteroskedasticity-robust or cluster-robust variance-covariance matrix.

    Uses the sandwich estimator: (X'X)^{-1} * meat * (X'X)^{-1}

    Parameters
    ----------
    X : ndarray of shape (n, k)
        Design matrix.
    residuals : ndarray of shape (n,)
        OLS residuals.
    cluster_ids : ndarray of shape (n,), optional
        Cluster identifiers. If None, computes HC1 robust SEs.

    Returns
    -------
    vcov : ndarray of shape (k, k)
        Variance-covariance matrix.

    Notes
    -----
    For HC1 (no clustering):
        meat = X' * diag(u^2) * X
        adjustment = n / (n - k)

    For cluster-robust:
        meat = sum_g (X_g' u_g)(X_g' u_g)'
        adjustment = (G / (G-1)) * ((n-1) / (n-k))

    The cluster-robust computation is vectorized using pandas groupby,
    which is much faster than a Python loop over clusters.
    """
    # Use Rust backend if available
    if HAS_RUST_BACKEND:
        X = np.ascontiguousarray(X, dtype=np.float64)
        residuals = np.ascontiguousarray(residuals, dtype=np.float64)

        cluster_ids_int = None
        if cluster_ids is not None:
            cluster_ids_int = pd.factorize(cluster_ids)[0].astype(np.int64)

        try:
            return _rust_compute_robust_vcov(X, residuals, cluster_ids_int)
        except ValueError as e:
            # Translate Rust LAPACK errors to consistent Python error messages
            error_msg = str(e)
            if "Matrix inversion failed" in error_msg:
                raise ValueError(
                    "Design matrix is rank-deficient (singular X'X matrix). "
                    "This indicates perfect multicollinearity. Check your fixed effects "
                    "and covariates for linear dependencies."
                ) from e
            raise

    # Fallback to NumPy implementation
    return _compute_robust_vcov_numpy(X, residuals, cluster_ids)


def _compute_robust_vcov_numpy(
    X: np.ndarray,
    residuals: np.ndarray,
    cluster_ids: Optional[np.ndarray] = None,
) -> np.ndarray:
    """
    NumPy fallback implementation of compute_robust_vcov.

    Computes HC1 (heteroskedasticity-robust) or cluster-robust variance-covariance
    matrix using the sandwich estimator.

    Parameters
    ----------
    X : np.ndarray
        Design matrix of shape (n, k).
    residuals : np.ndarray
        OLS residuals of shape (n,).
    cluster_ids : np.ndarray, optional
        Cluster identifiers. If None, uses HC1. If provided, uses
        cluster-robust with G/(G-1) small-sample adjustment.

    Returns
    -------
    vcov : np.ndarray
        Variance-covariance matrix of shape (k, k).

    Notes
    -----
    Uses vectorized groupby aggregation for cluster-robust SEs to avoid
    the O(n * G) loop that would be required with explicit iteration.
    """
    n, k = X.shape
    XtX = X.T @ X

    if cluster_ids is None:
        # HC1 (heteroskedasticity-robust) standard errors
        adjustment = n / (n - k)
        u_squared = residuals**2
        # Vectorized meat computation: X' diag(u^2) X = (X * u^2)' X
        meat = X.T @ (X * u_squared[:, np.newaxis])
    else:
        # Cluster-robust standard errors (vectorized via groupby)
        cluster_ids = np.asarray(cluster_ids)
        unique_clusters = np.unique(cluster_ids)
        n_clusters = len(unique_clusters)

        if n_clusters < 2:
            raise ValueError(
                f"Need at least 2 clusters for cluster-robust SEs, got {n_clusters}"
            )

        # Small-sample adjustment
        adjustment = (n_clusters / (n_clusters - 1)) * ((n - 1) / (n - k))

        # Compute cluster-level scores: sum of X_i * u_i within each cluster
        # scores[i] = X[i] * residuals[i] for each observation
        scores = X * residuals[:, np.newaxis]  # (n, k)

        # Sum scores within each cluster using pandas groupby (vectorized)
        # This is much faster than looping over clusters
        cluster_scores = pd.DataFrame(scores).groupby(cluster_ids).sum().values  # (G, k)

        # Meat is the outer product sum: sum_g (score_g)(score_g)'
        # Equivalent to cluster_scores.T @ cluster_scores
        meat = cluster_scores.T @ cluster_scores  # (k, k)

    # Sandwich estimator: (X'X)^{-1} meat (X'X)^{-1}
    # Solve (X'X) temp = meat, then solve (X'X) vcov' = temp'
    # More stable than explicit inverse
    try:
        temp = np.linalg.solve(XtX, meat)
        vcov = adjustment * np.linalg.solve(XtX, temp.T).T
    except np.linalg.LinAlgError as e:
        if "Singular" in str(e):
            raise ValueError(
                "Design matrix is rank-deficient (singular X'X matrix). "
                "This indicates perfect multicollinearity. Check your fixed effects "
                "and covariates for linear dependencies."
            ) from e
        raise

    return vcov
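

# A minimal illustrative sketch of the vectorization claim in the Notes above:
# the groupby-based cluster "meat" is algebraically identical to an explicit
# loop over clusters. The toy data is an assumption made only for this check;
# nothing in the package calls this function.
def _sketch_cluster_meat_equivalence() -> None:
    rng = np.random.default_rng(1)
    n, k = 200, 3
    X = rng.normal(size=(n, k))
    u = rng.normal(size=n)
    cluster_ids = np.repeat(np.arange(20), n // 20)

    # Vectorized version, as used in _compute_robust_vcov_numpy
    scores = X * u[:, np.newaxis]
    cluster_scores = pd.DataFrame(scores).groupby(cluster_ids).sum().values
    meat_vectorized = cluster_scores.T @ cluster_scores

    # Explicit loop over clusters for comparison
    meat_loop = np.zeros((k, k))
    for g in np.unique(cluster_ids):
        s_g = scores[cluster_ids == g].sum(axis=0)
        meat_loop += np.outer(s_g, s_g)

    assert np.allclose(meat_vectorized, meat_loop)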


def compute_r_squared(
    y: np.ndarray,
    residuals: np.ndarray,
    adjusted: bool = False,
    n_params: int = 0,
) -> float:
    """
    Compute R-squared or adjusted R-squared.

    Parameters
    ----------
    y : ndarray of shape (n,)
        Response vector.
    residuals : ndarray of shape (n,)
        OLS residuals.
    adjusted : bool, default False
        If True, compute adjusted R-squared.
    n_params : int, default 0
        Number of parameters (including intercept). Required if adjusted=True.

    Returns
    -------
    r_squared : float
        R-squared or adjusted R-squared.
    """
    ss_res = np.sum(residuals**2)
    ss_tot = np.sum((y - np.mean(y)) ** 2)

    if ss_tot == 0:
        return 0.0

    r_squared = 1 - (ss_res / ss_tot)

    if adjusted:
        n = len(y)
        if n <= n_params:
            return r_squared
        r_squared = 1 - (1 - r_squared) * (n - 1) / (n - n_params)

    return r_squared
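

# A minimal illustrative sketch combining solve_ols and compute_r_squared
# above. The simulated data-generating step is an assumption made only for
# this example; nothing in the package calls this function.
def _sketch_r_squared() -> None:
    rng = np.random.default_rng(2)
    X = np.column_stack([np.ones(500), rng.normal(size=(500, 2))])
    y = X @ np.array([1.0, 0.5, -0.3]) + rng.normal(size=500)
    _, resid, _ = solve_ols(X, y)
    r2 = compute_r_squared(y, resid)
    r2_adj = compute_r_squared(y, resid, adjusted=True, n_params=X.shape[1])
    print(f"R^2 = {r2:.3f}, adjusted R^2 = {r2_adj:.3f}")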


# =============================================================================
# LinearRegression Helper Class
# =============================================================================


@dataclass
class InferenceResult:
    """
    Container for inference results on a single coefficient.

    This dataclass provides a unified way to access coefficient estimates
    and their associated inference statistics.

    Attributes
    ----------
    coefficient : float
        The point estimate of the coefficient.
    se : float
        Standard error of the coefficient.
    t_stat : float
        T-statistic (coefficient / se).
    p_value : float
        Two-sided p-value for the t-statistic.
    conf_int : tuple of (float, float)
        Confidence interval (lower, upper).
    df : int or None
        Degrees of freedom used for inference. None if using normal distribution.
    alpha : float
        Significance level used for confidence interval.

    Examples
    --------
    >>> result = InferenceResult(
    ...     coefficient=2.5, se=0.5, t_stat=5.0, p_value=0.001,
    ...     conf_int=(1.52, 3.48), df=100, alpha=0.05
    ... )
    >>> result.is_significant()
    True
    >>> result.significance_stars()
    '***'
    """

    coefficient: float
    se: float
    t_stat: float
    p_value: float
    conf_int: Tuple[float, float]
    df: Optional[int] = None
    alpha: float = 0.05

    def is_significant(self, alpha: Optional[float] = None) -> bool:
        """Check if the coefficient is statistically significant."""
        threshold = alpha if alpha is not None else self.alpha
        return self.p_value < threshold

    def significance_stars(self) -> str:
        """Return significance stars based on p-value."""
        if self.p_value < 0.001:
            return "***"
        elif self.p_value < 0.01:
            return "**"
        elif self.p_value < 0.05:
            return "*"
        elif self.p_value < 0.1:
            return "."
        return ""

    def to_dict(self) -> Dict[str, Union[float, Tuple[float, float], int, None]]:
        """Convert to dictionary representation."""
        return {
            "coefficient": self.coefficient,
            "se": self.se,
            "t_stat": self.t_stat,
            "p_value": self.p_value,
            "conf_int": self.conf_int,
            "df": self.df,
            "alpha": self.alpha,
        }


class LinearRegression:
    """
    OLS regression helper with unified coefficient extraction and inference.

    This class wraps the low-level `solve_ols` function and provides a clean
    interface for fitting regressions and extracting coefficient-level inference.
    It eliminates code duplication across estimators by centralizing the common
    pattern of: fit OLS -> extract coefficient -> compute SE -> compute t-stat
    -> compute p-value -> compute CI.

    Parameters
    ----------
    include_intercept : bool, default True
        Whether to automatically add an intercept column to the design matrix.
    robust : bool, default True
        Whether to use heteroskedasticity-robust (HC1) standard errors.
        If False and cluster_ids is None, uses classical OLS standard errors.
    cluster_ids : array-like, optional
        Cluster identifiers for cluster-robust standard errors.
        Overrides the `robust` parameter if provided.
    alpha : float, default 0.05
        Significance level for confidence intervals.

    Attributes
    ----------
    coefficients_ : ndarray
        Fitted coefficient values (available after fit).
    vcov_ : ndarray
        Variance-covariance matrix (available after fit).
    residuals_ : ndarray
        Residuals from the fit (available after fit).
    fitted_values_ : ndarray
        Fitted values from the fit (available after fit).
    n_obs_ : int
        Number of observations (available after fit).
    n_params_ : int
        Number of parameters including intercept (available after fit).
    df_ : int
        Degrees of freedom (n - k) (available after fit).

    Examples
    --------
    Basic usage with automatic intercept:

    >>> import numpy as np
    >>> from diff_diff.linalg import LinearRegression
    >>> X = np.random.randn(100, 2)
    >>> y = 1 + 2 * X[:, 0] + 3 * X[:, 1] + np.random.randn(100)
    >>> reg = LinearRegression().fit(X, y)
    >>> print(f"Intercept: {reg.coefficients_[0]:.2f}")
    >>> inference = reg.get_inference(1)  # inference for first predictor
    >>> print(f"Coef: {inference.coefficient:.2f}, SE: {inference.se:.2f}")

    Using with cluster-robust standard errors:

    >>> cluster_ids = np.repeat(np.arange(20), 5)  # 20 clusters of 5
    >>> reg = LinearRegression(cluster_ids=cluster_ids).fit(X, y)
    >>> inference = reg.get_inference(1)
    >>> print(f"Cluster-robust SE: {inference.se:.2f}")

    Extracting multiple coefficients at once:

    >>> results = reg.get_inference_batch([1, 2])
    >>> for idx, inf in results.items():
    ...     print(f"Coef {idx}: {inf.coefficient:.2f} ({inf.significance_stars()})")
    """

    def __init__(
        self,
        include_intercept: bool = True,
        robust: bool = True,
        cluster_ids: Optional[np.ndarray] = None,
        alpha: float = 0.05,
    ):
        self.include_intercept = include_intercept
        self.robust = robust
        self.cluster_ids = cluster_ids
        self.alpha = alpha

        # Fitted attributes (set by fit())
        self.coefficients_: Optional[np.ndarray] = None
        self.vcov_: Optional[np.ndarray] = None
        self.residuals_: Optional[np.ndarray] = None
        self.fitted_values_: Optional[np.ndarray] = None
        self._y: Optional[np.ndarray] = None
        self._X: Optional[np.ndarray] = None
        self.n_obs_: Optional[int] = None
        self.n_params_: Optional[int] = None
        self.df_: Optional[int] = None

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray,
        *,
        cluster_ids: Optional[np.ndarray] = None,
        df_adjustment: int = 0,
    ) -> "LinearRegression":
        """
        Fit OLS regression.

        Parameters
        ----------
        X : ndarray of shape (n, k)
            Design matrix. An intercept column will be added if include_intercept=True.
        y : ndarray of shape (n,)
            Response vector.
        cluster_ids : ndarray, optional
            Cluster identifiers for this fit. Overrides the instance-level
            cluster_ids if provided.
        df_adjustment : int, default 0
            Additional degrees of freedom adjustment (e.g., for absorbed fixed effects).
            The effective df will be n - k - df_adjustment.

        Returns
        -------
        self : LinearRegression
            Fitted estimator.
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y, dtype=np.float64)

        # Add intercept if requested
        if self.include_intercept:
            X = np.column_stack([np.ones(X.shape[0]), X])

        # Use provided cluster_ids or fall back to instance-level
        effective_cluster_ids = cluster_ids if cluster_ids is not None else self.cluster_ids

        # Determine if we need robust/cluster vcov
        compute_vcov = True

        if self.robust or effective_cluster_ids is not None:
            # Use solve_ols with robust/cluster SEs
            coefficients, residuals, fitted, vcov = solve_ols(
                X, y,
                cluster_ids=effective_cluster_ids,
                return_fitted=True,
                return_vcov=compute_vcov,
            )
        else:
            # Classical OLS - compute vcov separately
            coefficients, residuals, fitted, _ = solve_ols(
                X, y,
                return_fitted=True,
                return_vcov=False,
            )
            # Compute classical OLS variance-covariance matrix
            n, k = X.shape
            mse = np.sum(residuals**2) / (n - k)
            try:
                vcov = np.linalg.solve(X.T @ X, mse * np.eye(k))
            except np.linalg.LinAlgError:
                # Fall back to pseudo-inverse for singular matrices
                vcov = np.linalg.pinv(X.T @ X) * mse

        # Store fitted attributes
        self.coefficients_ = coefficients
        self.vcov_ = vcov
        self.residuals_ = residuals
        self.fitted_values_ = fitted
        self._y = y
        self._X = X
        self.n_obs_ = X.shape[0]
        self.n_params_ = X.shape[1]
        self.df_ = self.n_obs_ - self.n_params_ - df_adjustment

        return self

    def _check_fitted(self) -> None:
        """Raise error if model has not been fitted."""
        if self.coefficients_ is None:
            raise ValueError("Model has not been fitted. Call fit() first.")

    def get_coefficient(self, index: int) -> float:
        """
        Get the coefficient value at a specific index.

        Parameters
        ----------
        index : int
            Index of the coefficient in the coefficient array.

        Returns
        -------
        float
            Coefficient value.
        """
        self._check_fitted()
        return float(self.coefficients_[index])

    def get_se(self, index: int) -> float:
        """
        Get the standard error for a coefficient.

        Parameters
        ----------
        index : int
            Index of the coefficient.

        Returns
        -------
        float
            Standard error.
        """
        self._check_fitted()
        return float(np.sqrt(self.vcov_[index, index]))

    def get_inference(
        self,
        index: int,
        alpha: Optional[float] = None,
        df: Optional[int] = None,
    ) -> InferenceResult:
        """
        Get full inference results for a coefficient.

        This is the primary method for extracting coefficient-level inference,
        returning all statistics in a single call.

        Parameters
        ----------
        index : int
            Index of the coefficient in the coefficient array.
        alpha : float, optional
            Significance level for CI. Defaults to instance-level alpha.
        df : int, optional
            Degrees of freedom. Defaults to fitted df (n - k - df_adjustment).
            Set to None explicitly to use normal distribution instead of t.

        Returns
        -------
        InferenceResult
            Dataclass containing coefficient, se, t_stat, p_value, conf_int.

        Examples
        --------
        >>> reg = LinearRegression().fit(X, y)
        >>> result = reg.get_inference(1)
        >>> print(f"Effect: {result.coefficient:.3f} (SE: {result.se:.3f})")
        >>> print(f"95% CI: [{result.conf_int[0]:.3f}, {result.conf_int[1]:.3f}]")
        >>> if result.is_significant():
        ...     print("Statistically significant!")
        """
        self._check_fitted()

        coef = float(self.coefficients_[index])
        se = float(np.sqrt(self.vcov_[index, index]))

        # Handle zero or negative SE (indicates perfect fit or numerical issues)
        if se <= 0:
            import warnings
            warnings.warn(
                f"Standard error is zero or negative (se={se}) for coefficient at index {index}. "
                "This may indicate perfect multicollinearity or numerical issues.",
                UserWarning,
            )
            # Use inf for t-stat when SE is zero (perfect fit scenario)
            if coef > 0:
                t_stat = np.inf
            elif coef < 0:
                t_stat = -np.inf
            else:
                t_stat = 0.0
        else:
            t_stat = coef / se

        # Use instance alpha if not provided
        effective_alpha = alpha if alpha is not None else self.alpha

        # Use fitted df if not explicitly provided
        # Note: df=None means use normal distribution
        effective_df = df if df is not None else self.df_

        # Warn if df is non-positive and fall back to normal distribution
        if effective_df is not None and effective_df <= 0:
            import warnings
            warnings.warn(
                f"Degrees of freedom is non-positive (df={effective_df}). "
                "Using normal distribution instead of t-distribution for inference.",
                UserWarning,
            )
            effective_df = None

        # Compute p-value
        p_value = _compute_p_value(t_stat, df=effective_df)

        # Compute confidence interval
        conf_int = _compute_confidence_interval(coef, se, effective_alpha, df=effective_df)

        return InferenceResult(
            coefficient=coef,
            se=se,
            t_stat=t_stat,
            p_value=p_value,
            conf_int=conf_int,
            df=effective_df,
            alpha=effective_alpha,
        )

    def get_inference_batch(
        self,
        indices: List[int],
        alpha: Optional[float] = None,
        df: Optional[int] = None,
    ) -> Dict[int, InferenceResult]:
        """
        Get inference results for multiple coefficients.

        Parameters
        ----------
        indices : list of int
            Indices of coefficients to extract.
        alpha : float, optional
            Significance level for CIs. Defaults to instance-level alpha.
        df : int, optional
            Degrees of freedom. Defaults to fitted df.

        Returns
        -------
        dict
            Dictionary mapping index -> InferenceResult.

        Examples
        --------
        >>> reg = LinearRegression().fit(X, y)
        >>> results = reg.get_inference_batch([1, 2, 3])
        >>> for idx, inf in results.items():
        ...     print(f"Coef {idx}: {inf.coefficient:.3f} {inf.significance_stars()}")
        """
        self._check_fitted()
        return {idx: self.get_inference(idx, alpha=alpha, df=df) for idx in indices}

    def get_all_inference(
        self,
        alpha: Optional[float] = None,
        df: Optional[int] = None,
    ) -> List[InferenceResult]:
        """
        Get inference results for all coefficients.

        Parameters
        ----------
        alpha : float, optional
            Significance level for CIs. Defaults to instance-level alpha.
        df : int, optional
            Degrees of freedom. Defaults to fitted df.

        Returns
        -------
        list of InferenceResult
            Inference results for each coefficient in order.
        """
        self._check_fitted()
        return [
            self.get_inference(i, alpha=alpha, df=df)
            for i in range(len(self.coefficients_))
        ]

    def r_squared(self, adjusted: bool = False) -> float:
        """
        Compute R-squared or adjusted R-squared.

        Parameters
        ----------
        adjusted : bool, default False
            If True, return adjusted R-squared.

        Returns
        -------
        float
            R-squared value.
        """
        self._check_fitted()
        return compute_r_squared(
            self._y, self.residuals_, adjusted=adjusted, n_params=self.n_params_
        )

    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Predict using the fitted model.

        Parameters
        ----------
        X : ndarray of shape (n, k)
            Design matrix for prediction. Should have same number of columns
            as the original X (excluding intercept if include_intercept=True).

        Returns
        -------
        ndarray
            Predicted values.
        """
        self._check_fitted()
        X = np.asarray(X, dtype=np.float64)

        if self.include_intercept:
            X = np.column_stack([np.ones(X.shape[0]), X])

        return X @ self.coefficients_
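

# A minimal illustrative sketch of the LinearRegression workflow documented
# above: fit, then assemble a compact coefficient table from
# get_all_inference(). The simulated design, outcome, and column names are
# assumptions made only for this example; nothing in the package calls this.
def _sketch_regression_summary() -> None:
    rng = np.random.default_rng(3)
    X = rng.normal(size=(150, 2))
    y = 1.0 + 2.0 * X[:, 0] - 0.5 * X[:, 1] + rng.normal(size=150)
    reg = LinearRegression().fit(X, y)
    names = ["const", "x1", "x2"]  # intercept is added automatically
    for name, inf in zip(names, reg.get_all_inference()):
        lo, hi = inf.conf_int
        print(f"{name:>6}  {inf.coefficient: .3f}  ({inf.se:.3f})  "
              f"[{lo: .3f}, {hi: .3f}] {inf.significance_stars()}")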


# =============================================================================
# Internal helpers for inference (used by LinearRegression)
# =============================================================================


def _compute_p_value(
    t_stat: float,
    df: Optional[int] = None,
    two_sided: bool = True,
) -> float:
    """
    Compute p-value for a t-statistic.

    Parameters
    ----------
    t_stat : float
        T-statistic.
    df : int, optional
        Degrees of freedom. If None, uses normal distribution.
    two_sided : bool, default True
        Whether to compute two-sided p-value.

    Returns
    -------
    float
        P-value.
    """
    if df is not None and df > 0:
        p_value = stats.t.sf(np.abs(t_stat), df)
    else:
        p_value = stats.norm.sf(np.abs(t_stat))

    if two_sided:
        p_value *= 2

    return float(p_value)


def _compute_confidence_interval(
    estimate: float,
    se: float,
    alpha: float = 0.05,
    df: Optional[int] = None,
) -> Tuple[float, float]:
    """
    Compute confidence interval for an estimate.

    Parameters
    ----------
    estimate : float
        Point estimate.
    se : float
        Standard error.
    alpha : float, default 0.05
        Significance level (0.05 for 95% CI).
    df : int, optional
        Degrees of freedom. If None, uses normal distribution.

    Returns
    -------
    tuple of (float, float)
        (lower_bound, upper_bound) of confidence interval.
    """
    if df is not None and df > 0:
        critical_value = stats.t.ppf(1 - alpha / 2, df)
    else:
        critical_value = stats.norm.ppf(1 - alpha / 2)

    lower = estimate - critical_value * se
    upper = estimate + critical_value * se

    return (lower, upper)