openstat_cli-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. openstat/__init__.py +3 -0
  2. openstat/__main__.py +4 -0
  3. openstat/backends/__init__.py +16 -0
  4. openstat/backends/duckdb_backend.py +70 -0
  5. openstat/backends/polars_backend.py +52 -0
  6. openstat/cli.py +92 -0
  7. openstat/commands/__init__.py +82 -0
  8. openstat/commands/adv_stat_cmds.py +1255 -0
  9. openstat/commands/advanced_ml_cmds.py +576 -0
  10. openstat/commands/advreg_cmds.py +207 -0
  11. openstat/commands/alias_cmds.py +135 -0
  12. openstat/commands/arch_cmds.py +82 -0
  13. openstat/commands/arules_cmds.py +111 -0
  14. openstat/commands/automodel_cmds.py +212 -0
  15. openstat/commands/backend_cmds.py +82 -0
  16. openstat/commands/base.py +170 -0
  17. openstat/commands/bayes_cmds.py +71 -0
  18. openstat/commands/causal_cmds.py +269 -0
  19. openstat/commands/cluster_cmds.py +152 -0
  20. openstat/commands/data_cmds.py +996 -0
  21. openstat/commands/datamanip_cmds.py +672 -0
  22. openstat/commands/dataquality_cmds.py +174 -0
  23. openstat/commands/datetime_cmds.py +176 -0
  24. openstat/commands/dimreduce_cmds.py +184 -0
  25. openstat/commands/discrete_cmds.py +149 -0
  26. openstat/commands/dsl_cmds.py +143 -0
  27. openstat/commands/epi_cmds.py +93 -0
  28. openstat/commands/equiv_tobit_cmds.py +94 -0
  29. openstat/commands/esttab_cmds.py +196 -0
  30. openstat/commands/export_beamer_cmds.py +142 -0
  31. openstat/commands/export_cmds.py +201 -0
  32. openstat/commands/export_extra_cmds.py +240 -0
  33. openstat/commands/factor_cmds.py +180 -0
  34. openstat/commands/groupby_cmds.py +155 -0
  35. openstat/commands/help_cmds.py +237 -0
  36. openstat/commands/i18n_cmds.py +43 -0
  37. openstat/commands/import_extra_cmds.py +561 -0
  38. openstat/commands/influence_cmds.py +134 -0
  39. openstat/commands/iv_cmds.py +106 -0
  40. openstat/commands/manova_cmds.py +105 -0
  41. openstat/commands/mediate_cmds.py +233 -0
  42. openstat/commands/meta_cmds.py +284 -0
  43. openstat/commands/mi_cmds.py +228 -0
  44. openstat/commands/mixed_cmds.py +79 -0
  45. openstat/commands/mixture_changepoint_cmds.py +166 -0
  46. openstat/commands/ml_adv_cmds.py +147 -0
  47. openstat/commands/ml_cmds.py +178 -0
  48. openstat/commands/model_eval_cmds.py +142 -0
  49. openstat/commands/network_cmds.py +288 -0
  50. openstat/commands/nlquery_cmds.py +161 -0
  51. openstat/commands/nonparam_cmds.py +149 -0
  52. openstat/commands/outreg_cmds.py +247 -0
  53. openstat/commands/panel_cmds.py +141 -0
  54. openstat/commands/pdf_cmds.py +226 -0
  55. openstat/commands/pipeline_cmds.py +319 -0
  56. openstat/commands/plot_cmds.py +189 -0
  57. openstat/commands/plugin_cmds.py +79 -0
  58. openstat/commands/posthoc_cmds.py +153 -0
  59. openstat/commands/power_cmds.py +172 -0
  60. openstat/commands/profile_cmds.py +246 -0
  61. openstat/commands/rbridge_cmds.py +81 -0
  62. openstat/commands/regex_cmds.py +104 -0
  63. openstat/commands/report_cmds.py +48 -0
  64. openstat/commands/repro_cmds.py +129 -0
  65. openstat/commands/resampling_cmds.py +109 -0
  66. openstat/commands/reshape_cmds.py +223 -0
  67. openstat/commands/sem_cmds.py +177 -0
  68. openstat/commands/stat_cmds.py +1040 -0
  69. openstat/commands/stata_import_cmds.py +215 -0
  70. openstat/commands/string_cmds.py +124 -0
  71. openstat/commands/surv_cmds.py +145 -0
  72. openstat/commands/survey_cmds.py +153 -0
  73. openstat/commands/textanalysis_cmds.py +192 -0
  74. openstat/commands/ts_adv_cmds.py +136 -0
  75. openstat/commands/ts_cmds.py +195 -0
  76. openstat/commands/tui_cmds.py +111 -0
  77. openstat/commands/ux_cmds.py +191 -0
  78. openstat/commands/validate_cmds.py +270 -0
  79. openstat/commands/viz_adv_cmds.py +312 -0
  80. openstat/commands/viz_extra_cmds.py +251 -0
  81. openstat/commands/watch_cmds.py +69 -0
  82. openstat/config.py +106 -0
  83. openstat/dsl/__init__.py +0 -0
  84. openstat/dsl/parser.py +332 -0
  85. openstat/dsl/tokenizer.py +105 -0
  86. openstat/i18n.py +120 -0
  87. openstat/io/__init__.py +0 -0
  88. openstat/io/loader.py +187 -0
  89. openstat/jupyter/__init__.py +18 -0
  90. openstat/jupyter/display.py +18 -0
  91. openstat/jupyter/magic.py +60 -0
  92. openstat/logging_config.py +59 -0
  93. openstat/plots/__init__.py +0 -0
  94. openstat/plots/plotter.py +437 -0
  95. openstat/plots/surv_plots.py +32 -0
  96. openstat/plots/ts_plots.py +59 -0
  97. openstat/plugins/__init__.py +5 -0
  98. openstat/plugins/manager.py +69 -0
  99. openstat/repl.py +457 -0
  100. openstat/reporting/__init__.py +0 -0
  101. openstat/reporting/eda.py +208 -0
  102. openstat/reporting/report.py +67 -0
  103. openstat/script_runner.py +319 -0
  104. openstat/session.py +133 -0
  105. openstat/stats/__init__.py +0 -0
  106. openstat/stats/advanced_regression.py +269 -0
  107. openstat/stats/arch_garch.py +84 -0
  108. openstat/stats/bayesian.py +103 -0
  109. openstat/stats/causal.py +258 -0
  110. openstat/stats/clustering.py +206 -0
  111. openstat/stats/discrete.py +311 -0
  112. openstat/stats/epidemiology.py +119 -0
  113. openstat/stats/equiv_tobit.py +163 -0
  114. openstat/stats/factor.py +174 -0
  115. openstat/stats/imputation.py +282 -0
  116. openstat/stats/influence.py +78 -0
  117. openstat/stats/iv.py +131 -0
  118. openstat/stats/manova.py +124 -0
  119. openstat/stats/mixed.py +128 -0
  120. openstat/stats/ml.py +275 -0
  121. openstat/stats/ml_advanced.py +117 -0
  122. openstat/stats/model_eval.py +183 -0
  123. openstat/stats/models.py +1342 -0
  124. openstat/stats/nonparametric.py +130 -0
  125. openstat/stats/panel.py +179 -0
  126. openstat/stats/power.py +295 -0
  127. openstat/stats/resampling.py +203 -0
  128. openstat/stats/survey.py +213 -0
  129. openstat/stats/survival.py +196 -0
  130. openstat/stats/timeseries.py +142 -0
  131. openstat/stats/ts_advanced.py +114 -0
  132. openstat/types.py +11 -0
  133. openstat/web/__init__.py +1 -0
  134. openstat/web/app.py +117 -0
  135. openstat/web/session_manager.py +73 -0
  136. openstat/web/static/app.js +117 -0
  137. openstat/web/static/index.html +38 -0
  138. openstat/web/static/style.css +103 -0
  139. openstat_cli-1.0.0.dist-info/METADATA +748 -0
  140. openstat_cli-1.0.0.dist-info/RECORD +143 -0
  141. openstat_cli-1.0.0.dist-info/WHEEL +4 -0
  142. openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
  143. openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
openstat/stats/factor.py ADDED
@@ -0,0 +1,174 @@
+ """Factor Analysis and PCA."""
+
+ from __future__ import annotations
+
+ import numpy as np
+ import polars as pl
+
+ try:
+     from sklearn.decomposition import PCA as _SklearnPCA  # noqa: F401
+     _HAS_SKLEARN = True
+ except ImportError:
+     _HAS_SKLEARN = False
+
+
+ # ── PCA ────────────────────────────────────────────────────────────────────
+
+ def fit_pca(
+     df: pl.DataFrame,
+     cols: list[str],
+     n_components: int | None = None,
+ ) -> dict:
+     """
+     Fit PCA via an eigendecomposition of the correlation matrix (no sklearn required).
+
+     Returns:
+         eigenvalues, loadings, explained_variance_ratio, cumulative_variance,
+         scores (component scores for each observation), n_components, cols
+     """
+     X = df.select(cols).to_numpy().astype(float)
+     # standardise
+     X = (X - X.mean(axis=0)) / (X.std(axis=0) + 1e-15)
+
+     n, p = X.shape
+     if n_components is None:
+         n_components = min(n, p)
+     n_components = min(n_components, min(n, p))
+
+     cov = X.T @ X / (n - 1)
+     eigvals, eigvecs = np.linalg.eigh(cov)
+     # sort descending
+     order = np.argsort(eigvals)[::-1]
+     eigvals = eigvals[order]
+     eigvecs = eigvecs[:, order]
+
+     eigvals = eigvals[:n_components]
+     eigvecs = eigvecs[:, :n_components]
+
+     # For standardised data the total variance equals p, so each
+     # eigenvalue / p is that component's proportion of total variance.
+     evr = eigvals / p
+     cum_var = np.cumsum(eigvals / p)
+
+     scores = X @ eigvecs
+
+     return {
+         "eigenvalues": eigvals.tolist(),
+         "loadings": eigvecs.tolist(),  # shape (p, n_components)
+         "explained_variance_ratio": evr.tolist(),
+         "cumulative_variance": cum_var.tolist(),
+         "scores": scores.tolist(),
+         "n_components": n_components,
+         "cols": cols,
+     }
+
+
+ # ── Varimax rotation ───────────────────────────────────────────────────────
+
+ def varimax_rotation(loadings: np.ndarray, tol: float = 1e-6, max_iter: int = 1000) -> np.ndarray:
+     """Orthogonal varimax rotation of factor loadings."""
+     p, k = loadings.shape
+     rotation = np.eye(k)
+
+     for _ in range(max_iter):
+         old_rotation = rotation.copy()
+         for i in range(k):
+             for j in range(i + 1, k):
+                 x = loadings @ rotation
+                 u = x[:, i] ** 2 - x[:, j] ** 2
+                 v = 2 * x[:, i] * x[:, j]
+                 A = v.sum()
+                 B = u.sum()
+                 C = (v**2 - u**2).sum()
+                 D = 2 * (u * v).sum()
+                 num = D - 2 * A * B / p
+                 den = C - (A**2 - B**2) / p
+                 if abs(den) < 1e-15:
+                     continue
+                 theta = 0.25 * np.arctan2(num, den)
+                 c, s = np.cos(theta), np.sin(theta)
+                 rot2 = np.eye(k)
+                 rot2[i, i] = c
+                 rot2[j, j] = c
+                 rot2[i, j] = -s
+                 rot2[j, i] = s
+                 rotation = rotation @ rot2
+
+         if np.max(np.abs(rotation - old_rotation)) < tol:
+             break
+
+     return loadings @ rotation
+
+
+ # ── Factor Analysis ────────────────────────────────────────────────────────
+
+ def fit_factor(
+     df: pl.DataFrame,
+     cols: list[str],
+     n_factors: int = 2,
+     method: str = "pc",
+     rotate: bool = True,
+ ) -> dict:
+     """
+     Fit a factor analysis model.
+
+     method: 'pc' (principal components extraction) or 'ml' (maximum likelihood via statsmodels, if available)
+     rotate: apply varimax rotation when True
+
+     Returns:
+         loadings, communalities, uniqueness, n_factors, cols
+     """
+     X = df.select(cols).to_numpy().astype(float)
+     n, p = X.shape
+     X = (X - X.mean(axis=0)) / (X.std(axis=0) + 1e-15)
+     n_factors = min(n_factors, p)
+
+     if method == "ml":
+         try:
+             import statsmodels.multivariate.factor as sm_fa  # type: ignore[import]
+             fa = sm_fa.Factor(X, n_factor=n_factors, method="ml")
+             res = fa.fit()
+             loadings = np.array(res.loadings)
+             if rotate and n_factors > 1:
+                 loadings = varimax_rotation(loadings)
+             communalities = (loadings**2).sum(axis=1)
+             uniqueness = 1 - communalities
+             return {
+                 "method": "ml",
+                 "loadings": loadings.tolist(),
+                 "communalities": communalities.tolist(),
+                 "uniqueness": uniqueness.tolist(),
+                 "n_factors": n_factors,
+                 "cols": cols,
+             }
+         except Exception:
+             pass  # fall through to PC method
+
+     # Principal components extraction
+     cov = X.T @ X / (n - 1)
+     eigvals, eigvecs = np.linalg.eigh(cov)
+     order = np.argsort(eigvals)[::-1]
+     eigvals = eigvals[order]
+     eigvecs = eigvecs[:, order]
+
+     eigvals_k = eigvals[:n_factors]
+     eigvecs_k = eigvecs[:, :n_factors]
+
+     loadings = eigvecs_k * np.sqrt(np.maximum(eigvals_k, 0))
+
+     if rotate and n_factors > 1:
+         loadings = varimax_rotation(loadings)
+
+     communalities = (loadings**2).sum(axis=1)
+     uniqueness = 1 - communalities
+
+     return {
+         "method": "pc",
+         "eigenvalues": eigvals.tolist(),
+         "loadings": loadings.tolist(),
+         "communalities": communalities.tolist(),
+         "uniqueness": uniqueness.tolist(),
+         "n_factors": n_factors,
+         "cols": cols,
+     }
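
A minimal usage sketch for this module (the DataFrame and column names here are hypothetical, not part of the package):

import polars as pl
from openstat.stats.factor import fit_pca, fit_factor

df = pl.DataFrame({
    "x1": [2.1, 3.4, 1.8, 4.0, 2.9, 3.3],
    "x2": [1.0, 2.2, 0.9, 2.8, 1.7, 2.1],
    "x3": [5.5, 4.1, 6.0, 3.2, 4.8, 4.4],
})

pca = fit_pca(df, ["x1", "x2", "x3"], n_components=2)
print(pca["explained_variance_ratio"])  # share of variance per component

fa = fit_factor(df, ["x1", "x2", "x3"], n_factors=2, method="pc", rotate=True)
print(fa["loadings"])  # varimax-rotated loadings, shape (3, 2)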
openstat/stats/imputation.py ADDED
@@ -0,0 +1,282 @@
+ """Multiple Imputation by Chained Equations (MICE) and Rubin's rules."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+
+ import numpy as np
+ import polars as pl
+ import statsmodels.api as sm
+ from scipy import stats as sp_stats
+
+
+ @dataclass
+ class MIResult:
+     """Combined result from multiple imputation using Rubin's rules."""
+
+     model_type: str
+     formula: str
+     m: int  # number of imputations
+     params: dict[str, float]
+     std_errors: dict[str, float]
+     t_values: dict[str, float]
+     p_values: dict[str, float]
+     conf_int_low: dict[str, float]
+     conf_int_high: dict[str, float]
+     n_obs: int
+     within_var: dict[str, float]  # U_bar
+     between_var: dict[str, float]  # B
+     fmi: dict[str, float]  # fraction of missing information
+
+
+ def _initial_fill(series: np.ndarray, rng: np.random.Generator) -> np.ndarray:
+     """Initialize missing values by sampling from the observed values."""
+     result = series.copy()
+     mask = np.isnan(result)
+     observed = result[~mask]
+     if len(observed) > 0 and mask.any():
+         result[mask] = rng.choice(observed, size=mask.sum())
+     return result
+
+
+ def _impute_regression(data: np.ndarray, col_idx: int, rng: np.random.Generator) -> None:
+     """Impute a continuous variable using Bayesian linear regression."""
+     mask = np.isnan(data[:, col_idx])
+     if not mask.any():
+         return
+
+     obs_idx = ~mask
+     predictors = np.delete(data, col_idx, axis=1)
+
+     y_obs = data[obs_idx, col_idx]
+     X_obs = sm.add_constant(predictors[obs_idx])
+
+     try:
+         model = sm.OLS(y_obs, X_obs).fit()
+         # Draw from the posterior (proper imputation)
+         beta_hat = model.params
+         sigma2 = model.scale
+         # Draw sigma^2 from a scaled inverse chi-squared distribution
+         n = len(y_obs)
+         k = len(beta_hat)
+         sigma2_draw = sigma2 * (n - k) / rng.chisquare(n - k)
+         # Draw beta from a multivariate normal centred at the OLS estimate
+         cov = model.cov_params() * sigma2_draw / sigma2
+         beta_draw = rng.multivariate_normal(beta_hat, cov)
+
+         # Predict missing values, adding residual noise
+         X_miss = sm.add_constant(predictors[mask])
+         y_pred = X_miss @ beta_draw
+         y_pred += rng.normal(0, np.sqrt(sigma2_draw), size=len(y_pred))
+         data[mask, col_idx] = y_pred
+     except Exception:
+         # Fallback: simple mean imputation
+         data[mask, col_idx] = np.nanmean(data[:, col_idx])
+
+
+ def _impute_logit(data: np.ndarray, col_idx: int, rng: np.random.Generator) -> None:
+     """Impute a binary variable using logistic regression."""
+     mask = np.isnan(data[:, col_idx])
+     if not mask.any():
+         return
+
+     obs_idx = ~mask
+     predictors = np.delete(data, col_idx, axis=1)
+
+     y_obs = data[obs_idx, col_idx]
+     X_obs = sm.add_constant(predictors[obs_idx])
+
+     try:
+         model = sm.Logit(y_obs, X_obs).fit(disp=0)
+         beta_hat = model.params
+         cov = model.cov_params()
+         beta_draw = rng.multivariate_normal(beta_hat, cov)
+
+         X_miss = sm.add_constant(predictors[mask])
+         logits = X_miss @ beta_draw
+         probs = 1 / (1 + np.exp(-logits))
+         data[mask, col_idx] = (rng.random(size=len(probs)) < probs).astype(float)
+     except Exception:
+         data[mask, col_idx] = np.round(np.nanmean(data[:, col_idx]))
+
+
+ def _impute_pmm(data: np.ndarray, col_idx: int, rng: np.random.Generator, k: int = 5) -> None:
+     """Impute using Predictive Mean Matching."""
+     mask = np.isnan(data[:, col_idx])
+     if not mask.any():
+         return
+
+     obs_idx = ~mask
+     predictors = np.delete(data, col_idx, axis=1)
+
+     y_obs = data[obs_idx, col_idx]
+     X_obs = sm.add_constant(predictors[obs_idx])
+
+     try:
+         model = sm.OLS(y_obs, X_obs).fit()
+         y_hat_obs = model.predict(X_obs)
+
+         beta_draw = rng.multivariate_normal(model.params, model.cov_params())
+         X_miss = sm.add_constant(predictors[mask])
+         y_hat_miss = X_miss @ beta_draw
+
+         # For each missing value, find the k nearest donors and sample one
+         imputed = np.empty(len(y_hat_miss))
+         for i, pred in enumerate(y_hat_miss):
+             distances = np.abs(y_hat_obs - pred)
+             donor_indices = np.argsort(distances)[:k]
+             chosen = rng.choice(donor_indices)
+             imputed[i] = y_obs[chosen]
+         data[mask, col_idx] = imputed
+     except Exception:
+         data[mask, col_idx] = np.nanmean(data[:, col_idx])
+
+
+ def mice_impute(
+     df: pl.DataFrame,
+     specs: list[tuple[str, str]],
+     m: int = 5,
+     max_iter: int = 10,
+     seed: int = 42,
+ ) -> list[pl.DataFrame]:
+     """Run MICE (Multiple Imputation by Chained Equations).
+
+     Args:
+         df: Input DataFrame with missing values.
+         specs: List of (method, column) tuples.
+             method: one of "regress", "logit", "pmm"
+         m: Number of imputed datasets.
+         max_iter: Number of MICE iterations.
+         seed: Random seed.
+
+     Returns a list of m imputed DataFrames.
+     """
+     rng = np.random.default_rng(seed)
+     col_names = [col for _, col in specs]
+     other_cols = [c for c in df.columns if c not in col_names]
+     all_cols = col_names + other_cols
+
+     imputed_datasets: list[pl.DataFrame] = []
+
+     for _ in range(m):
+         # Convert to numpy for fast computation
+         data = df.select(all_cols).to_numpy().astype(float)
+         n_imp_cols = len(col_names)
+
+         # Initialize missing values with random draws from the observed
+         for i in range(n_imp_cols):
+             data[:, i] = _initial_fill(data[:, i], rng)
+
+         # Iterate the chained equations
+         for _ in range(max_iter):
+             for i, (method, _col) in enumerate(specs):
+                 # Temporarily set imputed values back to NaN for this column
+                 orig = df[_col].to_numpy().astype(float)
+                 was_missing = np.isnan(orig)
+                 data[was_missing, i] = np.nan
+
+                 if method == "regress":
+                     _impute_regression(data, i, rng)
+                 elif method == "logit":
+                     _impute_logit(data, i, rng)
+                 elif method == "pmm":
+                     _impute_pmm(data, i, rng)
+                 else:
+                     # Default to regression
+                     _impute_regression(data, i, rng)
+
+         # Convert back to Polars
+         imputed_df = pl.DataFrame({
+             col: data[:, i] for i, col in enumerate(all_cols)
+         })
+         imputed_datasets.append(imputed_df)
+
+     return imputed_datasets
+
+
+ def rubins_rules(
+     estimates: list[dict[str, float]],
+     std_errors: list[dict[str, float]],
+     n_obs: int,
+ ) -> MIResult:
+     """Combine estimates from m imputed datasets using Rubin's rules.
+
+     Args:
+         estimates: List of parameter dicts, one per imputed dataset.
+         std_errors: List of SE dicts, one per imputed dataset.
+         n_obs: Number of observations.
+
+     Returns MIResult with combined estimates.
+     """
+     m = len(estimates)
+     var_names = list(estimates[0].keys())
+
+     combined_params: dict[str, float] = {}
+     combined_se: dict[str, float] = {}
+     combined_t: dict[str, float] = {}
+     combined_p: dict[str, float] = {}
+     combined_ci_low: dict[str, float] = {}
+     combined_ci_high: dict[str, float] = {}
+     within_var: dict[str, float] = {}
+     between_var: dict[str, float] = {}
+     fmi_dict: dict[str, float] = {}
+
+     for var in var_names:
+         # Point estimate: average across imputations
+         q_vals = np.array([est[var] for est in estimates])
+         q_bar = float(np.mean(q_vals))
+
+         # Within-imputation variance
+         u_vals = np.array([se[var] ** 2 for se in std_errors])
+         u_bar = float(np.mean(u_vals))
+
+         # Between-imputation variance
+         b = float(np.var(q_vals, ddof=1))
+
+         # Total variance
+         t = u_bar + (1 + 1 / m) * b
+
+         # Degrees of freedom (Barnard-Rubin)
+         if b > 0 and u_bar > 0:
+             r = (1 + 1 / m) * b / u_bar
+             lam = (1 + 1 / m) * b / t  # fraction of variance due to missingness
+             df_old = (m - 1) * (1 + 1 / r) ** 2
+             nu_com = n_obs - len(var_names)  # complete-data degrees of freedom
+             df_obs = (nu_com + 1) / (nu_com + 3) * nu_com * (1 - lam)
+             if df_obs > 0:
+                 df = (df_old * df_obs) / (df_old + df_obs)
+             else:
+                 df = df_old
+             fmi = (r + 2 / (df + 3)) / (r + 1)
+         else:
+             df = max(n_obs - len(var_names), 1)
+             fmi = 0.0
+
+         se = np.sqrt(t)
+         t_val = q_bar / se if se > 0 else 0.0
+         p_val = float(2 * (1 - sp_stats.t.cdf(abs(t_val), df))) if df > 0 else 1.0
+         # CI uses the t distribution with Barnard-Rubin df
+         t_crit = float(sp_stats.t.ppf(0.975, df)) if df > 0 else 1.96
+         ci_low = q_bar - t_crit * se
+         ci_high = q_bar + t_crit * se
+
+         combined_params[var] = q_bar
+         combined_se[var] = float(se)
+         combined_t[var] = t_val
+         combined_p[var] = p_val
+         combined_ci_low[var] = ci_low
+         combined_ci_high[var] = ci_high
+         within_var[var] = u_bar
+         between_var[var] = b
+         fmi_dict[var] = fmi
+
+     return MIResult(
+         model_type="MI",
+         formula="",
+         m=m,
+         params=combined_params,
+         std_errors=combined_se,
+         t_values=combined_t,
+         p_values=combined_p,
+         conf_int_low=combined_ci_low,
+         conf_int_high=combined_ci_high,
+         n_obs=n_obs,
+         within_var=within_var,
+         between_var=between_var,
+         fmi=fmi_dict,
+     )
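
A sketch of the intended end-to-end workflow: impute, fit the analysis model on each completed dataset, then pool with Rubin's rules. The columns y and x are hypothetical, and the per-dataset fit here uses statsmodels directly rather than any openstat command:

import polars as pl
import statsmodels.api as sm
from openstat.stats.imputation import mice_impute, rubins_rules

df = pl.DataFrame({
    "y": [1.2, 2.3, None, 4.1, 5.0, 3.3, None, 2.8],
    "x": [0.5, None, 1.5, 2.0, 2.4, 1.6, 0.9, None],
})

# m imputed copies of the data (Bayesian regression for y, PMM for x)
datasets = mice_impute(df, specs=[("regress", "y"), ("pmm", "x")], m=5)

# Fit the analysis model on each imputed dataset, then pool
estimates, std_errors = [], []
for d in datasets:
    X = sm.add_constant(d["x"].to_numpy())
    fit = sm.OLS(d["y"].to_numpy(), X).fit()
    estimates.append({"const": float(fit.params[0]), "x": float(fit.params[1])})
    std_errors.append({"const": float(fit.bse[0]), "x": float(fit.bse[1])})

pooled = rubins_rules(estimates, std_errors, n_obs=df.height)
print(pooled.params, pooled.std_errors, pooled.fmi)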
openstat/stats/influence.py ADDED
@@ -0,0 +1,78 @@
+ """Influence diagnostics: leverage, Cook's D, DFBETAs, outlier detection."""
+
+ from __future__ import annotations
+
+ import numpy as np
+ import polars as pl
+
+
+ def compute_influence(df: pl.DataFrame, dep: str, indeps: list[str]) -> dict:
+     """Compute OLS influence statistics: leverage, Cook's D, DFBETAs, studentized residuals."""
+     sub = df.select([dep] + indeps).drop_nulls()
+     y = sub[dep].to_numpy().astype(float)
+     X_raw = sub.select(indeps).to_numpy().astype(float)
+     n, k = X_raw.shape
+     X = np.column_stack([np.ones(n), X_raw])
+     kp = k + 1
+
+     # OLS fit
+     XtX_inv = np.linalg.pinv(X.T @ X)
+     beta = XtX_inv @ X.T @ y
+     y_hat = X @ beta
+     resid = y - y_hat
+     mse = (resid @ resid) / (n - kp)
+
+     # Hat matrix diagonal (leverage), computed row-wise to avoid the full n x n matrix
+     leverage = ((X @ XtX_inv) * X).sum(axis=1)
+
+     # Studentized residuals (internal)
+     sigma = np.sqrt(mse)
+     std_resid = resid / (sigma * np.sqrt(1 - leverage + 1e-10))
+
+     # Cook's Distance
+     cooks_d = (std_resid ** 2 * leverage) / (kp * (1 - leverage + 1e-10))
+
+     # DFBETAs per coefficient: closed-form leave-one-out change in each beta,
+     # scaled by that coefficient's standard error (full-sample MSE is used in
+     # place of the leave-one-out residual variance)
+     dfbetas = {}
+     se_beta = np.sqrt(mse * np.diag(XtX_inv))
+     sens = XtX_inv @ X.T  # (kp, n): per-observation sensitivity of each coefficient
+     for j in range(kp):
+         name = "_cons" if j == 0 else indeps[j - 1]
+         dfb = sens[j] * resid / ((1 - leverage + 1e-10) * (se_beta[j] + 1e-10))
+         dfbetas[name] = dfb.tolist()
+
+     # Squared Mahalanobis distance on X for multivariate outlier detection
+     X_centered = X_raw - X_raw.mean(axis=0)
+     try:
+         cov_inv = np.linalg.pinv(np.cov(X_raw.T))
+         mahal = np.array([float(x @ cov_inv @ x) for x in X_centered])
+     except Exception:
+         mahal = np.zeros(n)
+
+     return {
+         "n_obs": n,
+         "n_params": kp,
+         "leverage": leverage.tolist(),
+         "cooks_d": cooks_d.tolist(),
+         "std_residuals": std_resid.tolist(),
+         "mahalanobis": mahal.tolist(),
+         "dfbetas": dfbetas,
+         "high_leverage_threshold": 2 * kp / n,
+         "high_cooks_threshold": 4 / n,
+         "n_high_leverage": int((leverage > 2 * kp / n).sum()),
+         "n_high_cooks": int((cooks_d > 4 / n).sum()),
+     }
+
+
+ def detect_outliers(df: pl.DataFrame, dep: str, indeps: list[str], threshold: float = 3.0) -> dict:
+     """Identify outliers with |studentized residual| > threshold."""
+     inf = compute_influence(df, dep, indeps)
+     std_resid = np.array(inf["std_residuals"])
+     outlier_idx = np.where(np.abs(std_resid) > threshold)[0].tolist()
+     return {
+         "outlier_indices": outlier_idx,
+         "n_outliers": len(outlier_idx),
+         "threshold": threshold,
+         "std_residuals": inf["std_residuals"],
+     }
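
A usage sketch (the CSV file and variable names are hypothetical):

import polars as pl
from openstat.stats.influence import compute_influence, detect_outliers

df = pl.read_csv("cars.csv")  # hypothetical file with price, mpg, weight columns
inf = compute_influence(df, dep="price", indeps=["mpg", "weight"])

# Flag observations exceeding the conventional 4/n Cook's distance cutoff
flagged = [i for i, d in enumerate(inf["cooks_d"]) if d > inf["high_cooks_threshold"]]

out = detect_outliers(df, "price", ["mpg", "weight"], threshold=3.0)
print(flagged, out["outlier_indices"])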
openstat/stats/iv.py ADDED
@@ -0,0 +1,131 @@
+ """Instrumental variables: 2SLS, first-stage diagnostics, overidentification tests."""
+
+ from __future__ import annotations
+
+ import numpy as np
+ import polars as pl
+ import statsmodels.api as sm
+
+ from openstat.stats.models import FitResult
+
+
+ def _try_import_linearmodels():
+     try:
+         import linearmodels  # noqa: F401
+     except ImportError:
+         raise ImportError(
+             "IV models require linearmodels. "
+             "Install it with: pip install openstat[panel]"
+         )
+
+
+ def _iv_to_fit_result(result, dep: str, exog: list[str], endog: list[str], instruments: list[str]) -> FitResult:
+     """Convert linearmodels IVResults to FitResult."""
+     all_vars = list(result.params.index)
+     params = {name: float(val) for name, val in result.params.items()}
+     std_errors = {name: float(val) for name, val in result.std_errors.items()}
+     t_values = {name: float(val) for name, val in result.tstats.items()}
+     p_values = {name: float(val) for name, val in result.pvalues.items()}
+     ci = result.conf_int()
+     conf_low = {name: float(ci.loc[name, "lower"]) for name in all_vars}
+     conf_high = {name: float(ci.loc[name, "upper"]) for name in all_vars}
+
+     warnings_list: list[str] = []
+     warnings_list.append(f"Endogenous: {', '.join(endog)}")
+     warnings_list.append(f"Instruments: {', '.join(instruments)}")
+
+     return FitResult(
+         model_type="IV-2SLS",
+         formula=f"{dep} ~ {' + '.join(exog)} + ({' + '.join(endog)} = {' + '.join(instruments)})",
+         dep_var=dep,
+         indep_vars=all_vars,
+         n_obs=int(result.nobs),
+         params=params,
+         std_errors=std_errors,
+         t_values=t_values,
+         p_values=p_values,
+         conf_int_low=conf_low,
+         conf_int_high=conf_high,
+         r_squared=float(result.rsquared) if hasattr(result, "rsquared") else None,
+         f_statistic=float(result.f_statistic.stat) if hasattr(result, "f_statistic") and result.f_statistic is not None else None,
+         f_pvalue=float(result.f_statistic.pval) if hasattr(result, "f_statistic") and result.f_statistic is not None else None,
+         warnings=warnings_list,
+     )
+
+
+ def fit_iv_2sls(
+     df: pl.DataFrame,
+     dep: str,
+     exog: list[str],
+     endog: list[str],
+     instruments: list[str],
+     robust: bool = False,
+ ) -> tuple[FitResult, object]:
+     """Fit an IV model via Two-Stage Least Squares."""
+     _try_import_linearmodels()
+     from linearmodels.iv import IV2SLS
+
+     all_cols = [dep] + exog + endog + instruments
+     pdf = df.select(all_cols).to_pandas().dropna()
+
+     dep_data = pdf[dep]
+     exog_data = sm.add_constant(pdf[exog]) if exog else sm.add_constant(pdf[[]])
+     endog_data = pdf[endog]
+     instr_data = pdf[instruments]
+
+     model = IV2SLS(dep_data, exog_data, endog_data, instr_data)
+     cov_type = "robust" if robust else "unadjusted"
+     result = model.fit(cov_type=cov_type)
+
+     fit = _iv_to_fit_result(result, dep, ["const"] + exog, endog, instruments)
+     return fit, result
+
+
+ def first_stage_diagnostics(iv_result) -> str:
+     """Report first-stage regression diagnostics."""
+     lines = ["First-Stage Diagnostics:"]
+     try:
+         fs = iv_result.first_stage
+         for endog_var in fs.diagnostics:
+             diag = fs.diagnostics[endog_var]
+             lines.append(f"\n  Endogenous: {endog_var}")
+             lines.append(f"    Partial R²: {diag.rsquared:.4f}")
+             lines.append(f"    Partial F-stat: {diag.f_stat.stat:.2f} (p={diag.f_stat.pval:.4f})")
+             if diag.f_stat.stat < 10:
+                 lines.append("    ⚠ Weak instruments (F < 10)")
+     except Exception as e:
+         lines.append(f"  Could not compute: {e}")
+     return "\n".join(lines)
+
+
+ def overidentification_test(iv_result) -> str:
+     """Sargan/Hansen J-test for overidentifying restrictions."""
+     lines = ["Overidentification Test (Sargan/Hansen J):"]
+     try:
+         j_test = iv_result.sargan
+         lines.append(f"  J-statistic: {j_test.stat:.4f}")
+         lines.append(f"  p-value: {j_test.pval:.4f}")
+         lines.append(f"  df: {j_test.df}")
+         if j_test.pval < 0.05:
+             lines.append("  ⚠ Reject H0: instruments may not be valid")
+         else:
+             lines.append("  ✓ Cannot reject H0: instruments appear valid")
+     except Exception as e:
+         lines.append(f"  Not available (exactly identified or error: {e})")
+     return "\n".join(lines)
+
+
+ def endogeneity_test(iv_result) -> str:
+     """Durbin-Wu-Hausman test for endogeneity."""
+     lines = ["Endogeneity Test (Durbin-Wu-Hausman):"]
+     try:
+         wu_test = iv_result.wu_hausman()
+         lines.append(f"  Statistic: {wu_test.stat:.4f}")
+         lines.append(f"  p-value: {wu_test.pval:.4f}")
+         if wu_test.pval < 0.05:
+             lines.append("  ⚠ Reject H0: endogeneity detected — IV is appropriate")
+         else:
+             lines.append("  ✓ Cannot reject exogeneity — OLS may be sufficient")
+     except Exception as e:
+         lines.append(f"  Could not compute: {e}")
+     return "\n".join(lines)
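
A usage sketch (the dataset and variable names are hypothetical; requires the optional linearmodels dependency):

import polars as pl
from openstat.stats.iv import (
    fit_iv_2sls,
    first_stage_diagnostics,
    overidentification_test,
    endogeneity_test,
)

df = pl.read_csv("schooling.csv")  # hypothetical data
fit, raw = fit_iv_2sls(
    df,
    dep="log_wage",
    exog=["exper"],
    endog=["educ"],
    instruments=["near_college", "parent_educ"],  # 2 instruments, 1 endogenous: overidentified
    robust=True,
)
print(fit.params, fit.std_errors)
print(first_stage_diagnostics(raw))
print(overidentification_test(raw))  # the Sargan J-test needs the overidentified case
print(endogeneity_test(raw))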