openstat-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. openstat/__init__.py +3 -0
  2. openstat/__main__.py +4 -0
  3. openstat/backends/__init__.py +16 -0
  4. openstat/backends/duckdb_backend.py +70 -0
  5. openstat/backends/polars_backend.py +52 -0
  6. openstat/cli.py +92 -0
  7. openstat/commands/__init__.py +82 -0
  8. openstat/commands/adv_stat_cmds.py +1255 -0
  9. openstat/commands/advanced_ml_cmds.py +576 -0
  10. openstat/commands/advreg_cmds.py +207 -0
  11. openstat/commands/alias_cmds.py +135 -0
  12. openstat/commands/arch_cmds.py +82 -0
  13. openstat/commands/arules_cmds.py +111 -0
  14. openstat/commands/automodel_cmds.py +212 -0
  15. openstat/commands/backend_cmds.py +82 -0
  16. openstat/commands/base.py +170 -0
  17. openstat/commands/bayes_cmds.py +71 -0
  18. openstat/commands/causal_cmds.py +269 -0
  19. openstat/commands/cluster_cmds.py +152 -0
  20. openstat/commands/data_cmds.py +996 -0
  21. openstat/commands/datamanip_cmds.py +672 -0
  22. openstat/commands/dataquality_cmds.py +174 -0
  23. openstat/commands/datetime_cmds.py +176 -0
  24. openstat/commands/dimreduce_cmds.py +184 -0
  25. openstat/commands/discrete_cmds.py +149 -0
  26. openstat/commands/dsl_cmds.py +143 -0
  27. openstat/commands/epi_cmds.py +93 -0
  28. openstat/commands/equiv_tobit_cmds.py +94 -0
  29. openstat/commands/esttab_cmds.py +196 -0
  30. openstat/commands/export_beamer_cmds.py +142 -0
  31. openstat/commands/export_cmds.py +201 -0
  32. openstat/commands/export_extra_cmds.py +240 -0
  33. openstat/commands/factor_cmds.py +180 -0
  34. openstat/commands/groupby_cmds.py +155 -0
  35. openstat/commands/help_cmds.py +237 -0
  36. openstat/commands/i18n_cmds.py +43 -0
  37. openstat/commands/import_extra_cmds.py +561 -0
  38. openstat/commands/influence_cmds.py +134 -0
  39. openstat/commands/iv_cmds.py +106 -0
  40. openstat/commands/manova_cmds.py +105 -0
  41. openstat/commands/mediate_cmds.py +233 -0
  42. openstat/commands/meta_cmds.py +284 -0
  43. openstat/commands/mi_cmds.py +228 -0
  44. openstat/commands/mixed_cmds.py +79 -0
  45. openstat/commands/mixture_changepoint_cmds.py +166 -0
  46. openstat/commands/ml_adv_cmds.py +147 -0
  47. openstat/commands/ml_cmds.py +178 -0
  48. openstat/commands/model_eval_cmds.py +142 -0
  49. openstat/commands/network_cmds.py +288 -0
  50. openstat/commands/nlquery_cmds.py +161 -0
  51. openstat/commands/nonparam_cmds.py +149 -0
  52. openstat/commands/outreg_cmds.py +247 -0
  53. openstat/commands/panel_cmds.py +141 -0
  54. openstat/commands/pdf_cmds.py +226 -0
  55. openstat/commands/pipeline_cmds.py +319 -0
  56. openstat/commands/plot_cmds.py +189 -0
  57. openstat/commands/plugin_cmds.py +79 -0
  58. openstat/commands/posthoc_cmds.py +153 -0
  59. openstat/commands/power_cmds.py +172 -0
  60. openstat/commands/profile_cmds.py +246 -0
  61. openstat/commands/rbridge_cmds.py +81 -0
  62. openstat/commands/regex_cmds.py +104 -0
  63. openstat/commands/report_cmds.py +48 -0
  64. openstat/commands/repro_cmds.py +129 -0
  65. openstat/commands/resampling_cmds.py +109 -0
  66. openstat/commands/reshape_cmds.py +223 -0
  67. openstat/commands/sem_cmds.py +177 -0
  68. openstat/commands/stat_cmds.py +1040 -0
  69. openstat/commands/stata_import_cmds.py +215 -0
  70. openstat/commands/string_cmds.py +124 -0
  71. openstat/commands/surv_cmds.py +145 -0
  72. openstat/commands/survey_cmds.py +153 -0
  73. openstat/commands/textanalysis_cmds.py +192 -0
  74. openstat/commands/ts_adv_cmds.py +136 -0
  75. openstat/commands/ts_cmds.py +195 -0
  76. openstat/commands/tui_cmds.py +111 -0
  77. openstat/commands/ux_cmds.py +191 -0
  78. openstat/commands/validate_cmds.py +270 -0
  79. openstat/commands/viz_adv_cmds.py +312 -0
  80. openstat/commands/viz_extra_cmds.py +251 -0
  81. openstat/commands/watch_cmds.py +69 -0
  82. openstat/config.py +106 -0
  83. openstat/dsl/__init__.py +0 -0
  84. openstat/dsl/parser.py +332 -0
  85. openstat/dsl/tokenizer.py +105 -0
  86. openstat/i18n.py +120 -0
  87. openstat/io/__init__.py +0 -0
  88. openstat/io/loader.py +187 -0
  89. openstat/jupyter/__init__.py +18 -0
  90. openstat/jupyter/display.py +18 -0
  91. openstat/jupyter/magic.py +60 -0
  92. openstat/logging_config.py +59 -0
  93. openstat/plots/__init__.py +0 -0
  94. openstat/plots/plotter.py +437 -0
  95. openstat/plots/surv_plots.py +32 -0
  96. openstat/plots/ts_plots.py +59 -0
  97. openstat/plugins/__init__.py +5 -0
  98. openstat/plugins/manager.py +69 -0
  99. openstat/repl.py +457 -0
  100. openstat/reporting/__init__.py +0 -0
  101. openstat/reporting/eda.py +208 -0
  102. openstat/reporting/report.py +67 -0
  103. openstat/script_runner.py +319 -0
  104. openstat/session.py +133 -0
  105. openstat/stats/__init__.py +0 -0
  106. openstat/stats/advanced_regression.py +269 -0
  107. openstat/stats/arch_garch.py +84 -0
  108. openstat/stats/bayesian.py +103 -0
  109. openstat/stats/causal.py +258 -0
  110. openstat/stats/clustering.py +206 -0
  111. openstat/stats/discrete.py +311 -0
  112. openstat/stats/epidemiology.py +119 -0
  113. openstat/stats/equiv_tobit.py +163 -0
  114. openstat/stats/factor.py +174 -0
  115. openstat/stats/imputation.py +282 -0
  116. openstat/stats/influence.py +78 -0
  117. openstat/stats/iv.py +131 -0
  118. openstat/stats/manova.py +124 -0
  119. openstat/stats/mixed.py +128 -0
  120. openstat/stats/ml.py +275 -0
  121. openstat/stats/ml_advanced.py +117 -0
  122. openstat/stats/model_eval.py +183 -0
  123. openstat/stats/models.py +1342 -0
  124. openstat/stats/nonparametric.py +130 -0
  125. openstat/stats/panel.py +179 -0
  126. openstat/stats/power.py +295 -0
  127. openstat/stats/resampling.py +203 -0
  128. openstat/stats/survey.py +213 -0
  129. openstat/stats/survival.py +196 -0
  130. openstat/stats/timeseries.py +142 -0
  131. openstat/stats/ts_advanced.py +114 -0
  132. openstat/types.py +11 -0
  133. openstat/web/__init__.py +1 -0
  134. openstat/web/app.py +117 -0
  135. openstat/web/session_manager.py +73 -0
  136. openstat/web/static/app.js +117 -0
  137. openstat/web/static/index.html +38 -0
  138. openstat/web/static/style.css +103 -0
  139. openstat_cli-1.0.0.dist-info/METADATA +748 -0
  140. openstat_cli-1.0.0.dist-info/RECORD +143 -0
  141. openstat_cli-1.0.0.dist-info/WHEEL +4 -0
  142. openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
  143. openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,183 @@
1
+ """Model evaluation: ROC/AUC, confusion matrix, calibration, SHAP approximation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import polars as pl
7
+
8
+
9
+ def roc_auc(
10
+ df: pl.DataFrame,
11
+ outcome: str,
12
+ score: str,
13
+ ) -> dict:
14
+ """Compute ROC curve and AUC (trapezoidal rule)."""
15
+ sub = df.select([outcome, score]).drop_nulls()
16
+ y_true = sub[outcome].to_numpy().astype(int)
17
+ y_score = sub[score].to_numpy().astype(float)
18
+
19
+ thresholds = np.sort(np.unique(y_score))[::-1]
20
+ tpr_list = []
21
+ fpr_list = []
22
+ for t in thresholds:
23
+ y_pred = (y_score >= t).astype(int)
24
+ tp = int(((y_pred == 1) & (y_true == 1)).sum())
25
+ fp = int(((y_pred == 1) & (y_true == 0)).sum())
26
+ fn = int(((y_pred == 0) & (y_true == 1)).sum())
27
+ tn = int(((y_pred == 0) & (y_true == 0)).sum())
28
+ tpr_list.append(tp / max(tp + fn, 1))
29
+ fpr_list.append(fp / max(fp + tn, 1))
30
+
31
+ fpr_arr = np.array([0.0] + fpr_list + [1.0])
32
+ tpr_arr = np.array([0.0] + tpr_list + [1.0])
33
+ auc = float(np.trapezoid(tpr_arr, fpr_arr))
34
+
35
+ # Youden J statistic → optimal threshold
36
+ j = tpr_arr - fpr_arr
37
+ opt_idx = int(np.argmax(j))
38
+ opt_threshold = float(thresholds[max(opt_idx - 1, 0)]) if opt_idx > 0 else float(thresholds[0])
39
+
40
+ return {
41
+ "test": "ROC / AUC",
42
+ "outcome": outcome,
43
+ "score": score,
44
+ "auc": auc,
45
+ "fpr": fpr_arr.tolist(),
46
+ "tpr": tpr_arr.tolist(),
47
+ "thresholds": thresholds.tolist(),
48
+ "optimal_threshold": opt_threshold,
49
+ "n_obs": len(y_true),
50
+ "prevalence": float(y_true.mean()),
51
+ }
52
+
53
+
54
def confusion_matrix(
    df: pl.DataFrame,
    outcome: str,
    predicted: str,
    threshold: float = 0.5,
) -> dict:
    """Build the 2x2 confusion matrix and standard classification metrics.

    The predicted column may contain either probabilities, which are
    dichotomized at `threshold`, or hard 0/1 labels, which are used as-is.
    """
    data = df.select([outcome, predicted]).drop_nulls()
    actual = data[outcome].to_numpy().astype(int)
    raw = data[predicted].to_numpy().astype(float)

    # Hard 0/1 labels bypass the threshold; anything else is dichotomized.
    already_binary = set(np.unique(raw)).issubset({0, 1, 0.0, 1.0})
    labels = raw.astype(int) if already_binary else (raw >= threshold).astype(int)

    # Cell counts of the 2x2 table.
    tp = int(np.sum((labels == 1) & (actual == 1)))
    fp = int(np.sum((labels == 1) & (actual == 0)))
    fn = int(np.sum((labels == 0) & (actual == 1)))
    tn = int(np.sum((labels == 0) & (actual == 0)))

    # Every denominator is clamped to >= 1 (or a tiny epsilon for F1) so
    # degenerate tables yield 0 for the metric instead of dividing by zero.
    accuracy = (tp + tn) / max(tp + tn + fp + fn, 1)
    precision = tp / max(tp + fp, 1)
    recall = tp / max(tp + fn, 1)
    specificity = tn / max(tn + fp, 1)
    f1 = 2 * precision * recall / max(precision + recall, 1e-10)
    npv = tn / max(tn + fn, 1)
    mcc_num = tp * tn - fp * fn
    mcc_den = np.sqrt(max((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn), 1))
    mcc = float(mcc_num / mcc_den)

    return {
        "test": "Confusion Matrix",
        "outcome": outcome, "predicted": predicted,
        "threshold": threshold,
        "tp": tp, "fp": fp, "fn": fn, "tn": tn,
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall_sensitivity": float(recall),
        "specificity": float(specificity),
        "f1_score": float(f1),
        "npv": float(npv),
        "mcc": mcc,
        "n_obs": len(actual),
    }
100
+
101
+
102
def calibration_curve(
    df: pl.DataFrame,
    outcome: str,
    score: str,
    n_bins: int = 10,
) -> dict:
    """Calibration curve (reliability diagram) + Brier score.

    Parameters
    ----------
    df : pl.DataFrame
        Data containing the outcome and score columns.
    outcome : str
        Binary (0/1) outcome column.
    score : str
        Predicted-probability column (expected in [0, 1]).
    n_bins : int
        Number of equal-width probability bins; empty bins are omitted.

    Returns
    -------
    dict
        Brier score plus, per non-empty bin, its center, the mean
        predicted probability, and the observed fraction of positives.
    """
    sub = df.select([outcome, score]).drop_nulls()
    y_true = sub[outcome].to_numpy().astype(int)
    y_score = sub[score].to_numpy().astype(float)

    # Brier score: mean squared error of the predicted probabilities.
    brier = float(np.mean((y_score - y_true) ** 2))

    bins = np.linspace(0, 1, n_bins + 1)
    bin_centers = []
    mean_predicted = []
    fraction_positive = []

    for i in range(n_bins):
        # Bins are half-open [lo, hi), except the last which is closed so
        # a score of exactly 1.0 is not dropped.
        mask = (y_score >= bins[i]) & (y_score < bins[i + 1])
        if i == n_bins - 1:
            mask = (y_score >= bins[i]) & (y_score <= bins[i + 1])
        if mask.sum() > 0:
            bin_centers.append(float((bins[i] + bins[i + 1]) / 2))
            mean_predicted.append(float(y_score[mask].mean()))
            fraction_positive.append(float(y_true[mask].mean()))

    # FIX: a previous "Hosmer-Lemeshow approximation" computed an
    # `expected` array here that was never returned, and it binned the
    # last bin with strict `<` (unlike the loop above), so a score of
    # exactly 1.0 produced arrays of mismatched length and a broadcast
    # crash. The dead computation has been removed.

    return {
        "test": "Calibration Curve",
        "outcome": outcome, "score": score,
        "brier_score": brier,
        "n_bins": n_bins,
        "bin_centers": bin_centers,
        "mean_predicted": mean_predicted,
        "fraction_positive": fraction_positive,
        "n_obs": len(y_true),
    }
146
+
147
+
148
def compute_shap_linear(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
) -> dict:
    """
    Exact SHAP values for an OLS fit.

    For a linear model, SHAP_i(x) = beta_i * (x_i - E[x_i]) holds exactly,
    so no sampling approximation is needed.
    """
    data = df.select([dep] + indeps).drop_nulls()
    target = data[dep].to_numpy().astype(float)
    features = data.select(indeps).to_numpy().astype(float)
    n_rows, _n_feats = features.shape

    # Fit OLS by least squares on a design matrix with an intercept column.
    design = np.column_stack([np.ones(n_rows), features])
    coefs = np.linalg.lstsq(design, target, rcond=None)[0]

    # Per-observation, per-feature attribution: slope times the feature's
    # deviation from its mean (shape: n_rows x n_features).
    centered = features - features.mean(axis=0)
    contrib = centered * coefs[1:]

    importance = {name: float(np.abs(contrib[:, j]).mean()) for j, name in enumerate(indeps)}
    ranking = [name for name, _ in sorted(importance.items(), key=lambda kv: -kv[1])]

    return {
        "method": "Linear SHAP",
        "dep": dep,
        "indeps": indeps,
        "n_obs": n_rows,
        "mean_abs_shap": importance,
        "feature_ranking": ranking,
        "shap_values": contrib.tolist(),
        "coefficients": {name: float(coefs[j + 1]) for j, name in enumerate(indeps)},
        "intercept": float(coefs[0]),
    }
+ }