openstat-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstat/__init__.py +3 -0
- openstat/__main__.py +4 -0
- openstat/backends/__init__.py +16 -0
- openstat/backends/duckdb_backend.py +70 -0
- openstat/backends/polars_backend.py +52 -0
- openstat/cli.py +92 -0
- openstat/commands/__init__.py +82 -0
- openstat/commands/adv_stat_cmds.py +1255 -0
- openstat/commands/advanced_ml_cmds.py +576 -0
- openstat/commands/advreg_cmds.py +207 -0
- openstat/commands/alias_cmds.py +135 -0
- openstat/commands/arch_cmds.py +82 -0
- openstat/commands/arules_cmds.py +111 -0
- openstat/commands/automodel_cmds.py +212 -0
- openstat/commands/backend_cmds.py +82 -0
- openstat/commands/base.py +170 -0
- openstat/commands/bayes_cmds.py +71 -0
- openstat/commands/causal_cmds.py +269 -0
- openstat/commands/cluster_cmds.py +152 -0
- openstat/commands/data_cmds.py +996 -0
- openstat/commands/datamanip_cmds.py +672 -0
- openstat/commands/dataquality_cmds.py +174 -0
- openstat/commands/datetime_cmds.py +176 -0
- openstat/commands/dimreduce_cmds.py +184 -0
- openstat/commands/discrete_cmds.py +149 -0
- openstat/commands/dsl_cmds.py +143 -0
- openstat/commands/epi_cmds.py +93 -0
- openstat/commands/equiv_tobit_cmds.py +94 -0
- openstat/commands/esttab_cmds.py +196 -0
- openstat/commands/export_beamer_cmds.py +142 -0
- openstat/commands/export_cmds.py +201 -0
- openstat/commands/export_extra_cmds.py +240 -0
- openstat/commands/factor_cmds.py +180 -0
- openstat/commands/groupby_cmds.py +155 -0
- openstat/commands/help_cmds.py +237 -0
- openstat/commands/i18n_cmds.py +43 -0
- openstat/commands/import_extra_cmds.py +561 -0
- openstat/commands/influence_cmds.py +134 -0
- openstat/commands/iv_cmds.py +106 -0
- openstat/commands/manova_cmds.py +105 -0
- openstat/commands/mediate_cmds.py +233 -0
- openstat/commands/meta_cmds.py +284 -0
- openstat/commands/mi_cmds.py +228 -0
- openstat/commands/mixed_cmds.py +79 -0
- openstat/commands/mixture_changepoint_cmds.py +166 -0
- openstat/commands/ml_adv_cmds.py +147 -0
- openstat/commands/ml_cmds.py +178 -0
- openstat/commands/model_eval_cmds.py +142 -0
- openstat/commands/network_cmds.py +288 -0
- openstat/commands/nlquery_cmds.py +161 -0
- openstat/commands/nonparam_cmds.py +149 -0
- openstat/commands/outreg_cmds.py +247 -0
- openstat/commands/panel_cmds.py +141 -0
- openstat/commands/pdf_cmds.py +226 -0
- openstat/commands/pipeline_cmds.py +319 -0
- openstat/commands/plot_cmds.py +189 -0
- openstat/commands/plugin_cmds.py +79 -0
- openstat/commands/posthoc_cmds.py +153 -0
- openstat/commands/power_cmds.py +172 -0
- openstat/commands/profile_cmds.py +246 -0
- openstat/commands/rbridge_cmds.py +81 -0
- openstat/commands/regex_cmds.py +104 -0
- openstat/commands/report_cmds.py +48 -0
- openstat/commands/repro_cmds.py +129 -0
- openstat/commands/resampling_cmds.py +109 -0
- openstat/commands/reshape_cmds.py +223 -0
- openstat/commands/sem_cmds.py +177 -0
- openstat/commands/stat_cmds.py +1040 -0
- openstat/commands/stata_import_cmds.py +215 -0
- openstat/commands/string_cmds.py +124 -0
- openstat/commands/surv_cmds.py +145 -0
- openstat/commands/survey_cmds.py +153 -0
- openstat/commands/textanalysis_cmds.py +192 -0
- openstat/commands/ts_adv_cmds.py +136 -0
- openstat/commands/ts_cmds.py +195 -0
- openstat/commands/tui_cmds.py +111 -0
- openstat/commands/ux_cmds.py +191 -0
- openstat/commands/validate_cmds.py +270 -0
- openstat/commands/viz_adv_cmds.py +312 -0
- openstat/commands/viz_extra_cmds.py +251 -0
- openstat/commands/watch_cmds.py +69 -0
- openstat/config.py +106 -0
- openstat/dsl/__init__.py +0 -0
- openstat/dsl/parser.py +332 -0
- openstat/dsl/tokenizer.py +105 -0
- openstat/i18n.py +120 -0
- openstat/io/__init__.py +0 -0
- openstat/io/loader.py +187 -0
- openstat/jupyter/__init__.py +18 -0
- openstat/jupyter/display.py +18 -0
- openstat/jupyter/magic.py +60 -0
- openstat/logging_config.py +59 -0
- openstat/plots/__init__.py +0 -0
- openstat/plots/plotter.py +437 -0
- openstat/plots/surv_plots.py +32 -0
- openstat/plots/ts_plots.py +59 -0
- openstat/plugins/__init__.py +5 -0
- openstat/plugins/manager.py +69 -0
- openstat/repl.py +457 -0
- openstat/reporting/__init__.py +0 -0
- openstat/reporting/eda.py +208 -0
- openstat/reporting/report.py +67 -0
- openstat/script_runner.py +319 -0
- openstat/session.py +133 -0
- openstat/stats/__init__.py +0 -0
- openstat/stats/advanced_regression.py +269 -0
- openstat/stats/arch_garch.py +84 -0
- openstat/stats/bayesian.py +103 -0
- openstat/stats/causal.py +258 -0
- openstat/stats/clustering.py +206 -0
- openstat/stats/discrete.py +311 -0
- openstat/stats/epidemiology.py +119 -0
- openstat/stats/equiv_tobit.py +163 -0
- openstat/stats/factor.py +174 -0
- openstat/stats/imputation.py +282 -0
- openstat/stats/influence.py +78 -0
- openstat/stats/iv.py +131 -0
- openstat/stats/manova.py +124 -0
- openstat/stats/mixed.py +128 -0
- openstat/stats/ml.py +275 -0
- openstat/stats/ml_advanced.py +117 -0
- openstat/stats/model_eval.py +183 -0
- openstat/stats/models.py +1342 -0
- openstat/stats/nonparametric.py +130 -0
- openstat/stats/panel.py +179 -0
- openstat/stats/power.py +295 -0
- openstat/stats/resampling.py +203 -0
- openstat/stats/survey.py +213 -0
- openstat/stats/survival.py +196 -0
- openstat/stats/timeseries.py +142 -0
- openstat/stats/ts_advanced.py +114 -0
- openstat/types.py +11 -0
- openstat/web/__init__.py +1 -0
- openstat/web/app.py +117 -0
- openstat/web/session_manager.py +73 -0
- openstat/web/static/app.js +117 -0
- openstat/web/static/index.html +38 -0
- openstat/web/static/style.css +103 -0
- openstat_cli-1.0.0.dist-info/METADATA +748 -0
- openstat_cli-1.0.0.dist-info/RECORD +143 -0
- openstat_cli-1.0.0.dist-info/WHEEL +4 -0
- openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
- openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
openstat/stats/factor.py
ADDED
@@ -0,0 +1,174 @@
"""Factor Analysis and PCA."""

from __future__ import annotations

import numpy as np
import polars as pl

try:
    from sklearn.decomposition import PCA as _SklearnPCA
    _HAS_SKLEARN = True
except ImportError:
    _HAS_SKLEARN = False


# ── PCA ────────────────────────────────────────────────────────────────────

def fit_pca(
    df: pl.DataFrame,
    cols: list[str],
    n_components: int | None = None,
) -> dict:
    """
    Fit PCA via an eigendecomposition of the correlation matrix (no sklearn required).

    Returns:
        eigenvalues, loadings, explained_variance_ratio, cumulative_variance,
        scores (component scores for each observation), n_components, cols
    """
    X = df.select(cols).to_numpy().astype(float)
    # standardise
    X = (X - X.mean(axis=0)) / (X.std(axis=0) + 1e-15)

    n, p = X.shape
    if n_components is None:
        n_components = min(n, p)
    n_components = min(n_components, min(n, p))

    cov = X.T @ X / (n - 1)
    eigvals, eigvecs = np.linalg.eigh(cov)
    # sort descending
    order = np.argsort(eigvals)[::-1]
    eigvals = eigvals[order]
    eigvecs = eigvecs[:, order]

    eigvals = eigvals[:n_components]
    eigvecs = eigvecs[:, :n_components]

    # For standardised data the correlation matrix has trace p, so
    # eigenvalue / p is each component's share of the total variance.
    evr = eigvals / p
    cum_var = np.cumsum(eigvals / p)

    scores = X @ eigvecs

    return {
        "eigenvalues": eigvals.tolist(),
        "loadings": eigvecs.tolist(),  # shape (p, n_components)
        "explained_variance_ratio": evr.tolist(),
        "cumulative_variance": cum_var.tolist(),
        "scores": scores.tolist(),
        "n_components": n_components,
        "cols": cols,
    }


# ── Varimax rotation ───────────────────────────────────────────────────────

def varimax_rotation(loadings: np.ndarray, tol: float = 1e-6, max_iter: int = 1000) -> np.ndarray:
    """Orthogonal varimax rotation of factor loadings."""
    p, k = loadings.shape
    rotation = np.eye(k)

    for _ in range(max_iter):
        old_rotation = rotation.copy()
        for i in range(k):
            for j in range(i + 1, k):
                x = loadings @ rotation
                u = x[:, i] ** 2 - x[:, j] ** 2
                v = 2 * x[:, i] * x[:, j]
                A = v.sum()
                B = u.sum()
                C = (v**2 - u**2).sum()
                D = 2 * (u * v).sum()
                num = D - 2 * A * B / p
                den = C - (A**2 - B**2) / p
                if abs(den) < 1e-15:
                    continue
                theta = 0.25 * np.arctan2(num, den)
                c, s = np.cos(theta), np.sin(theta)
                rot2 = np.eye(k)
                rot2[i, i] = c
                rot2[j, j] = c
                rot2[i, j] = -s
                rot2[j, i] = s
                rotation = rotation @ rot2

        if np.max(np.abs(rotation - old_rotation)) < tol:
            break

    return loadings @ rotation


# ── Factor Analysis ────────────────────────────────────────────────────────

def fit_factor(
    df: pl.DataFrame,
    cols: list[str],
    n_factors: int = 2,
    method: str = "pc",
    rotate: bool = True,
) -> dict:
    """
    Fit a factor analysis model.

    method: 'pc' (principal components extraction) or 'ml' (maximum likelihood via statsmodels, if available)
    rotate: apply varimax rotation when True

    Returns:
        loadings, communalities, uniqueness, n_factors, cols
    """
    X = df.select(cols).to_numpy().astype(float)
    n, p = X.shape
    X = (X - X.mean(axis=0)) / (X.std(axis=0) + 1e-15)
    n_factors = min(n_factors, p)

    if method == "ml":
        try:
            import statsmodels.multivariate.factor as sm_fa  # type: ignore[import]

            fa = sm_fa.Factor(X, n_factor=n_factors, method="ml")
            res = fa.fit()
            loadings = np.array(res.loadings)
            if rotate and n_factors > 1:
                loadings = varimax_rotation(loadings)
            communalities = (loadings**2).sum(axis=1)
            uniqueness = 1 - communalities
            return {
                "method": "ml",
                "loadings": loadings.tolist(),
                "communalities": communalities.tolist(),
                "uniqueness": uniqueness.tolist(),
                "n_factors": n_factors,
                "cols": cols,
            }
        except Exception:
            pass  # fall through to PC method

    # Principal components extraction
    cov = X.T @ X / (n - 1)
    eigvals, eigvecs = np.linalg.eigh(cov)
    order = np.argsort(eigvals)[::-1]
    eigvals = eigvals[order]
    eigvecs = eigvecs[:, order]

    eigvals_k = eigvals[:n_factors]
    eigvecs_k = eigvecs[:, :n_factors]

    loadings = eigvecs_k * np.sqrt(np.maximum(eigvals_k, 0))

    if rotate and n_factors > 1:
        loadings = varimax_rotation(loadings)

    communalities = (loadings**2).sum(axis=1)
    uniqueness = 1 - communalities

    return {
        "method": "pc",
        "eigenvalues": eigvals.tolist(),
        "loadings": loadings.tolist(),
        "communalities": communalities.tolist(),
        "uniqueness": uniqueness.tolist(),
        "n_factors": n_factors,
        "cols": cols,
    }
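A minimal usage sketch for the two entry points above. The CSV path and column names are hypothetical; `fit_pca` and `fit_factor` are the functions defined in this file:

    import polars as pl
    from openstat.stats.factor import fit_pca, fit_factor

    df = pl.read_csv("data.csv")  # hypothetical file with numeric columns x1..x4

    # PCA: how much variance do the first two components capture?
    pca = fit_pca(df, ["x1", "x2", "x3", "x4"], n_components=2)
    print(pca["explained_variance_ratio"], pca["cumulative_variance"])

    # Factor analysis with varimax-rotated principal-components extraction
    fa = fit_factor(df, ["x1", "x2", "x3", "x4"], n_factors=2, method="pc", rotate=True)
    for col, comm in zip(fa["cols"], fa["communalities"]):
        print(f"{col}: communality={comm:.3f}")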
openstat/stats/imputation.py
ADDED
@@ -0,0 +1,282 @@
"""Multiple Imputation by Chained Equations (MICE) and Rubin's rules."""

from __future__ import annotations

from dataclasses import dataclass

import numpy as np
import polars as pl
import statsmodels.api as sm
from scipy import stats as sp_stats


@dataclass
class MIResult:
    """Combined result from multiple imputation using Rubin's rules."""

    model_type: str
    formula: str
    m: int  # number of imputations
    params: dict[str, float]
    std_errors: dict[str, float]
    t_values: dict[str, float]
    p_values: dict[str, float]
    conf_int_low: dict[str, float]
    conf_int_high: dict[str, float]
    n_obs: int
    within_var: dict[str, float]  # U_bar
    between_var: dict[str, float]  # B
    fmi: dict[str, float]  # fraction of missing information


def _initial_fill(series: np.ndarray, rng: np.random.Generator) -> np.ndarray:
    """Initialize missing values by sampling from observed."""
    result = series.copy()
    mask = np.isnan(result)
    observed = result[~mask]
    if len(observed) > 0 and mask.any():
        result[mask] = rng.choice(observed, size=mask.sum())
    return result


def _impute_regression(data: np.ndarray, col_idx: int, rng: np.random.Generator) -> None:
    """Impute a continuous variable using Bayesian linear regression."""
    mask = np.isnan(data[:, col_idx])
    if not mask.any():
        return

    obs_idx = ~mask
    predictors = np.delete(data, col_idx, axis=1)

    y_obs = data[obs_idx, col_idx]
    X_obs = sm.add_constant(predictors[obs_idx])

    try:
        model = sm.OLS(y_obs, X_obs).fit()
        # Draw from posterior (proper imputation)
        beta_hat = model.params
        sigma2 = model.scale
        # Draw sigma from scaled inverse chi-squared
        n = len(y_obs)
        k = len(beta_hat)
        sigma2_draw = sigma2 * (n - k) / rng.chisquare(n - k)
        # Draw beta from multivariate normal
        cov = model.cov_params() * sigma2_draw / sigma2
        beta_draw = rng.multivariate_normal(beta_hat, cov)

        # Predict missing
        X_miss = sm.add_constant(predictors[mask])
        y_pred = X_miss @ beta_draw
        y_pred += rng.normal(0, np.sqrt(sigma2_draw), size=len(y_pred))
        data[mask, col_idx] = y_pred
    except Exception:
        # Fallback: simple mean imputation
        data[mask, col_idx] = np.nanmean(data[:, col_idx])


def _impute_logit(data: np.ndarray, col_idx: int, rng: np.random.Generator) -> None:
    """Impute a binary variable using logistic regression."""
    mask = np.isnan(data[:, col_idx])
    if not mask.any():
        return

    obs_idx = ~mask
    predictors = np.delete(data, col_idx, axis=1)

    y_obs = data[obs_idx, col_idx]
    X_obs = sm.add_constant(predictors[obs_idx])

    try:
        model = sm.Logit(y_obs, X_obs).fit(disp=0)
        beta_hat = model.params
        cov = model.cov_params()
        beta_draw = rng.multivariate_normal(beta_hat, cov)

        X_miss = sm.add_constant(predictors[mask])
        logits = X_miss @ beta_draw
        probs = 1 / (1 + np.exp(-logits))
        data[mask, col_idx] = (rng.random(size=len(probs)) < probs).astype(float)
    except Exception:
        data[mask, col_idx] = np.round(np.nanmean(data[:, col_idx]))


def _impute_pmm(data: np.ndarray, col_idx: int, rng: np.random.Generator, k: int = 5) -> None:
    """Impute using Predictive Mean Matching."""
    mask = np.isnan(data[:, col_idx])
    if not mask.any():
        return

    obs_idx = ~mask
    predictors = np.delete(data, col_idx, axis=1)

    y_obs = data[obs_idx, col_idx]
    X_obs = sm.add_constant(predictors[obs_idx])

    try:
        model = sm.OLS(y_obs, X_obs).fit()
        y_hat_obs = model.predict(X_obs)

        beta_draw = rng.multivariate_normal(model.params, model.cov_params())
        X_miss = sm.add_constant(predictors[mask])
        y_hat_miss = X_miss @ beta_draw

        # For each missing value, find the k nearest donors and sample one
        miss_rows = np.where(mask)[0]
        for i, pred in enumerate(y_hat_miss):
            distances = np.abs(y_hat_obs - pred)
            donor_indices = np.argsort(distances)[:k]
            chosen = rng.choice(donor_indices)
            # Assign only this row's value, not the whole masked slice
            data[miss_rows[i], col_idx] = y_obs[chosen]
    except Exception:
        data[mask, col_idx] = np.nanmean(data[:, col_idx])


def mice_impute(
    df: pl.DataFrame,
    specs: list[tuple[str, str]],
    m: int = 5,
    max_iter: int = 10,
    seed: int = 42,
) -> list[pl.DataFrame]:
    """Run MICE (Multiple Imputation by Chained Equations).

    Args:
        df: Input DataFrame with missing values.
        specs: List of (method, column) tuples.
            method: "regress", "logit", "pmm"
        m: Number of imputed datasets.
        max_iter: Number of MICE iterations.
        seed: Random seed.

    Returns list of m imputed DataFrames.
    """
    rng = np.random.default_rng(seed)
    col_names = [col for _, col in specs]
    other_cols = [c for c in df.columns if c not in col_names]
    all_cols = col_names + other_cols

    imputed_datasets: list[pl.DataFrame] = []

    for imp in range(m):
        # Convert to numpy for fast computation
        data = df.select(all_cols).to_numpy().astype(float)
        n_imp_cols = len(col_names)

        # Initialize missing with random draws from observed
        for i in range(n_imp_cols):
            data[:, i] = _initial_fill(data[:, i], rng)

        # Iterate chained equations
        for _ in range(max_iter):
            for i, (method, _col) in enumerate(specs):
                # Temporarily set imputed values back to NaN for this col
                orig = df[_col].to_numpy().astype(float)
                was_missing = np.isnan(orig)
                save = data[was_missing, i].copy()
                data[was_missing, i] = np.nan

                if method == "regress":
                    _impute_regression(data, i, rng)
                elif method == "logit":
                    _impute_logit(data, i, rng)
                elif method == "pmm":
                    _impute_pmm(data, i, rng)
                else:
                    # Default to regression
                    _impute_regression(data, i, rng)

        # Convert back to Polars
        imputed_df = pl.DataFrame({
            col: data[:, i] for i, col in enumerate(all_cols)
        })
        imputed_datasets.append(imputed_df)

    return imputed_datasets


def rubins_rules(
    estimates: list[dict[str, float]],
    std_errors: list[dict[str, float]],
    n_obs: int,
) -> MIResult:
    """Combine estimates from m imputed datasets using Rubin's rules.

    Args:
        estimates: List of param dicts from each imputed dataset.
        std_errors: List of SE dicts from each imputed dataset.
        n_obs: Number of observations.

    Returns MIResult with combined estimates.
    """
    m = len(estimates)
    var_names = list(estimates[0].keys())

    combined_params: dict[str, float] = {}
    combined_se: dict[str, float] = {}
    combined_t: dict[str, float] = {}
    combined_p: dict[str, float] = {}
    combined_ci_low: dict[str, float] = {}
    combined_ci_high: dict[str, float] = {}
    within_var: dict[str, float] = {}
    between_var: dict[str, float] = {}
    fmi_dict: dict[str, float] = {}

    for var in var_names:
        # Point estimate: average across imputations
        q_vals = np.array([est[var] for est in estimates])
        q_bar = float(np.mean(q_vals))

        # Within-imputation variance
        u_vals = np.array([se[var] ** 2 for se in std_errors])
        u_bar = float(np.mean(u_vals))

        # Between-imputation variance
        b = float(np.var(q_vals, ddof=1))

        # Total variance
        t = u_bar + (1 + 1 / m) * b

        # Degrees of freedom (Barnard-Rubin)
        if b > 0 and u_bar > 0:
            r = (1 + 1 / m) * b / u_bar
            df_old = (m - 1) * (1 + 1 / r) ** 2
            # Barnard-Rubin observed-data df uses lambda = (1 + 1/m) * B / T
            lam = (1 + 1 / m) * b / t
            df_com = n_obs - len(var_names)
            df_obs = (df_com + 1) / (df_com + 3) * df_com * (1 - lam)
            if df_obs > 0:
                df = (df_old * df_obs) / (df_old + df_obs)
            else:
                df = df_old
            fmi = (r + 2 / (df + 3)) / (r + 1)
        else:
            df = max(n_obs - len(var_names), 1)
            fmi = 0.0

        se = np.sqrt(t)
        t_val = q_bar / se if se > 0 else 0.0
        p_val = float(2 * (1 - sp_stats.t.cdf(abs(t_val), df))) if df > 0 else 1.0
        # CI uses the same t reference distribution as the p-value
        t_crit = float(sp_stats.t.ppf(0.975, df)) if df > 0 else 1.96
        ci_low = q_bar - t_crit * se
        ci_high = q_bar + t_crit * se

        combined_params[var] = q_bar
        combined_se[var] = float(se)
        combined_t[var] = t_val
        combined_p[var] = p_val
        combined_ci_low[var] = ci_low
        combined_ci_high[var] = ci_high
        within_var[var] = u_bar
        between_var[var] = b
        fmi_dict[var] = fmi

    return MIResult(
        model_type="MI",
        formula="",
        m=m,
        params=combined_params,
        std_errors=combined_se,
        t_values=combined_t,
        p_values=combined_p,
        conf_int_low=combined_ci_low,
        conf_int_high=combined_ci_high,
        n_obs=n_obs,
        within_var=within_var,
        between_var=between_var,
        fmi=fmi_dict,
    )
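A sketch of the intended impute-analyze-pool workflow: fit the same model on each completed dataset, then combine with Rubin's rules. The OLS-on-each-imputation step, file path, and column names are illustrative, not part of this file:

    import polars as pl
    import statsmodels.api as sm
    from openstat.stats.imputation import mice_impute, rubins_rules

    df = pl.read_csv("survey.csv")  # hypothetical data with missing y and x1
    imputed = mice_impute(df, specs=[("pmm", "x1"), ("regress", "y")], m=5)

    estimates, std_errors = [], []
    for d in imputed:
        pdf = d.to_pandas()
        res = sm.OLS(pdf["y"], sm.add_constant(pdf[["x1", "x2"]])).fit()
        estimates.append(dict(res.params))   # coefficient dict per imputation
        std_errors.append(dict(res.bse))     # SE dict per imputation

    pooled = rubins_rules(estimates, std_errors, n_obs=df.height)
    print(pooled.params, pooled.std_errors, pooled.fmi)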
openstat/stats/influence.py
ADDED
@@ -0,0 +1,78 @@
"""Influence diagnostics: leverage, Cook's D, DFBETAs, outlier detection."""

from __future__ import annotations

import numpy as np
import polars as pl


def compute_influence(df: pl.DataFrame, dep: str, indeps: list[str]) -> dict:
    """Compute OLS influence statistics: leverage, Cook's D, DFBETAs, studentized residuals."""
    sub = df.select([dep] + indeps).drop_nulls()
    y = sub[dep].to_numpy().astype(float)
    X_raw = sub.select(indeps).to_numpy().astype(float)
    n, k = X_raw.shape
    X = np.column_stack([np.ones(n), X_raw])
    kp = k + 1

    # OLS fit
    XtX_inv = np.linalg.pinv(X.T @ X)
    beta = XtX_inv @ X.T @ y
    y_hat = X @ beta
    resid = y - y_hat
    mse = (resid @ resid) / (n - kp)

    # Hat matrix diagonal (leverage)
    H = X @ XtX_inv @ X.T
    leverage = np.diag(H)

    # Studentized residuals (internal)
    sigma = np.sqrt(mse)
    std_resid = resid / (sigma * np.sqrt(1 - leverage + 1e-10))

    # Cook's Distance
    cooks_d = (std_resid ** 2 * leverage) / (kp * (1 - leverage + 1e-10))

    # DFBETAs per coefficient: beta - beta(i) = (X'X)^-1 x_i e_i / (1 - h_i),
    # scaled by se(beta_j). Uses the overall sigma rather than the
    # leave-one-out estimate — a one-step approximation.
    dfbetas = {}
    se_beta = np.sqrt(mse * np.diag(XtX_inv))
    delta = (XtX_inv @ X.T) * (resid / (1 - leverage + 1e-10))  # shape (kp, n)
    for j in range(kp):
        name = "_cons" if j == 0 else indeps[j - 1]
        dfbetas[name] = (delta[j] / (se_beta[j] + 1e-10)).tolist()

    # Mahalanobis distance on X for outlier detection
    X_centered = X_raw - X_raw.mean(axis=0)
    try:
        cov_inv = np.linalg.pinv(np.cov(X_raw.T))
        mahal = np.array([float(x @ cov_inv @ x) for x in X_centered])
    except Exception:
        mahal = np.zeros(n)

    return {
        "n_obs": n,
        "n_params": kp,
        "leverage": leverage.tolist(),
        "cooks_d": cooks_d.tolist(),
        "std_residuals": std_resid.tolist(),
        "mahalanobis": mahal.tolist(),
        "dfbetas": dfbetas,
        "high_leverage_threshold": 2 * kp / n,
        "high_cooks_threshold": 4 / n,
        "n_high_leverage": int((leverage > 2 * kp / n).sum()),
        "n_high_cooks": int((cooks_d > 4 / n).sum()),
    }


def detect_outliers(df: pl.DataFrame, dep: str, indeps: list[str], threshold: float = 3.0) -> dict:
    """Identify outliers by studentized residuals > threshold."""
    inf = compute_influence(df, dep, indeps)
    std_resid = np.array(inf["std_residuals"])
    outlier_idx = np.where(np.abs(std_resid) > threshold)[0].tolist()
    return {
        "outlier_indices": outlier_idx,
        "n_outliers": len(outlier_idx),
        "threshold": threshold,
        "std_residuals": inf["std_residuals"],
    }
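A minimal usage sketch for the diagnostics above (dataset and column names are hypothetical):

    import polars as pl
    from openstat.stats.influence import compute_influence, detect_outliers

    df = pl.read_csv("cars.csv")  # hypothetical
    inf = compute_influence(df, dep="mpg", indeps=["weight", "hp"])
    # Flag observations past the 2*k/n leverage rule of thumb
    print("high leverage:", inf["n_high_leverage"],
          "threshold:", inf["high_leverage_threshold"])

    out = detect_outliers(df, dep="mpg", indeps=["weight", "hp"], threshold=3.0)
    print("outlier rows:", out["outlier_indices"])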
openstat/stats/iv.py
ADDED
@@ -0,0 +1,131 @@
"""Instrumental variables: 2SLS, first-stage diagnostics, overidentification tests."""

from __future__ import annotations

import numpy as np
import polars as pl
import statsmodels.api as sm

from openstat.stats.models import FitResult


def _try_import_linearmodels():
    try:
        import linearmodels  # noqa: F401
    except ImportError:
        raise ImportError(
            "IV models require linearmodels. "
            "Install it with: pip install openstat[panel]"
        )


def _iv_to_fit_result(result, dep: str, exog: list[str], endog: list[str], instruments: list[str]) -> FitResult:
    """Convert linearmodels IVResults to FitResult."""
    all_vars = list(result.params.index)
    params = {name: float(val) for name, val in result.params.items()}
    std_errors = {name: float(val) for name, val in result.std_errors.items()}
    t_values = {name: float(val) for name, val in result.tstats.items()}
    p_values = {name: float(val) for name, val in result.pvalues.items()}
    ci = result.conf_int()
    conf_low = {name: float(ci.loc[name, "lower"]) for name in all_vars}
    conf_high = {name: float(ci.loc[name, "upper"]) for name in all_vars}

    warnings_list: list[str] = []
    warnings_list.append(f"Endogenous: {', '.join(endog)}")
    warnings_list.append(f"Instruments: {', '.join(instruments)}")

    return FitResult(
        model_type="IV-2SLS",
        formula=f"{dep} ~ {' + '.join(exog)} + ({' + '.join(endog)} = {' + '.join(instruments)})",
        dep_var=dep,
        indep_vars=all_vars,
        n_obs=int(result.nobs),
        params=params,
        std_errors=std_errors,
        t_values=t_values,
        p_values=p_values,
        conf_int_low=conf_low,
        conf_int_high=conf_high,
        r_squared=float(result.rsquared) if hasattr(result, "rsquared") else None,
        f_statistic=float(result.f_statistic.stat) if hasattr(result, "f_statistic") and result.f_statistic is not None else None,
        f_pvalue=float(result.f_statistic.pval) if hasattr(result, "f_statistic") and result.f_statistic is not None else None,
        warnings=warnings_list,
    )


def fit_iv_2sls(
    df: pl.DataFrame,
    dep: str,
    exog: list[str],
    endog: list[str],
    instruments: list[str],
    robust: bool = False,
) -> tuple[FitResult, object]:
    """Fit an IV model via Two-Stage Least Squares."""
    _try_import_linearmodels()
    from linearmodels.iv import IV2SLS

    all_cols = [dep] + exog + endog + instruments
    pdf = df.select(all_cols).to_pandas().dropna()

    dep_data = pdf[dep]
    exog_data = sm.add_constant(pdf[exog]) if exog else sm.add_constant(pdf[[]])
    endog_data = pdf[endog]
    instr_data = pdf[instruments]

    model = IV2SLS(dep_data, exog_data, endog_data, instr_data)
    cov_type = "robust" if robust else "unadjusted"
    result = model.fit(cov_type=cov_type)

    fit = _iv_to_fit_result(result, dep, ["const"] + exog, endog, instruments)
    return fit, result


def first_stage_diagnostics(iv_result) -> str:
    """Report first-stage regression diagnostics."""
    lines = ["First-Stage Diagnostics:"]
    try:
        fs = iv_result.first_stage
        for endog_var in fs.diagnostics:
            diag = fs.diagnostics[endog_var]
            lines.append(f"\n  Endogenous: {endog_var}")
            lines.append(f"  Partial R²: {diag.rsquared:.4f}")
            lines.append(f"  Partial F-stat: {diag.f_stat.stat:.2f} (p={diag.f_stat.pval:.4f})")
            if diag.f_stat.stat < 10:
                lines.append("  ⚠ Weak instruments (F < 10)")
    except Exception as e:
        lines.append(f"  Could not compute: {e}")
    return "\n".join(lines)


def overidentification_test(iv_result) -> str:
    """Sargan/Hansen J-test for overidentifying restrictions."""
    lines = ["Overidentification Test (Sargan/Hansen J):"]
    try:
        j_test = iv_result.sargan
        lines.append(f"  J-statistic: {j_test.stat:.4f}")
        lines.append(f"  p-value: {j_test.pval:.4f}")
        lines.append(f"  df: {j_test.df}")
        if j_test.pval < 0.05:
            lines.append("  ⚠ Reject H0: instruments may not be valid")
        else:
            lines.append("  ✓ Cannot reject H0: instruments appear valid")
    except Exception as e:
        lines.append(f"  Not available (exactly identified or error: {e})")
    return "\n".join(lines)


def endogeneity_test(iv_result) -> str:
    """Durbin-Wu-Hausman test for endogeneity."""
    lines = ["Endogeneity Test (Durbin-Wu-Hausman):"]
    try:
        wu_test = iv_result.wu_hausman()
        lines.append(f"  Statistic: {wu_test.stat:.4f}")
        lines.append(f"  p-value: {wu_test.pval:.4f}")
        if wu_test.pval < 0.05:
            lines.append("  ⚠ Reject H0: endogeneity detected — IV is appropriate")
        else:
            lines.append("  ✓ Cannot reject exogeneity — OLS may be sufficient")
    except Exception as e:
        lines.append(f"  Could not compute: {e}")
    return "\n".join(lines)
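A sketch of how these helpers compose, using `fit_iv_2sls` as defined above. The file path and column names are hypothetical, and linearmodels must be installed (per the error message in `_try_import_linearmodels`):

    import polars as pl
    from openstat.stats.iv import (
        fit_iv_2sls, first_stage_diagnostics,
        overidentification_test, endogeneity_test,
    )

    df = pl.read_csv("wages.csv")  # hypothetical
    fit, raw = fit_iv_2sls(
        df, dep="log_wage", exog=["exper"],
        endog=["educ"], instruments=["mother_educ", "father_educ"],
        robust=True,
    )
    print(fit.formula)
    print(first_stage_diagnostics(raw))
    print(overidentification_test(raw))  # requires more instruments than endogenous vars
    print(endogeneity_test(raw))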