openstat-cli 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openstat/__init__.py +3 -0
- openstat/__main__.py +4 -0
- openstat/backends/__init__.py +16 -0
- openstat/backends/duckdb_backend.py +70 -0
- openstat/backends/polars_backend.py +52 -0
- openstat/cli.py +92 -0
- openstat/commands/__init__.py +82 -0
- openstat/commands/adv_stat_cmds.py +1255 -0
- openstat/commands/advanced_ml_cmds.py +576 -0
- openstat/commands/advreg_cmds.py +207 -0
- openstat/commands/alias_cmds.py +135 -0
- openstat/commands/arch_cmds.py +82 -0
- openstat/commands/arules_cmds.py +111 -0
- openstat/commands/automodel_cmds.py +212 -0
- openstat/commands/backend_cmds.py +82 -0
- openstat/commands/base.py +170 -0
- openstat/commands/bayes_cmds.py +71 -0
- openstat/commands/causal_cmds.py +269 -0
- openstat/commands/cluster_cmds.py +152 -0
- openstat/commands/data_cmds.py +996 -0
- openstat/commands/datamanip_cmds.py +672 -0
- openstat/commands/dataquality_cmds.py +174 -0
- openstat/commands/datetime_cmds.py +176 -0
- openstat/commands/dimreduce_cmds.py +184 -0
- openstat/commands/discrete_cmds.py +149 -0
- openstat/commands/dsl_cmds.py +143 -0
- openstat/commands/epi_cmds.py +93 -0
- openstat/commands/equiv_tobit_cmds.py +94 -0
- openstat/commands/esttab_cmds.py +196 -0
- openstat/commands/export_beamer_cmds.py +142 -0
- openstat/commands/export_cmds.py +201 -0
- openstat/commands/export_extra_cmds.py +240 -0
- openstat/commands/factor_cmds.py +180 -0
- openstat/commands/groupby_cmds.py +155 -0
- openstat/commands/help_cmds.py +237 -0
- openstat/commands/i18n_cmds.py +43 -0
- openstat/commands/import_extra_cmds.py +561 -0
- openstat/commands/influence_cmds.py +134 -0
- openstat/commands/iv_cmds.py +106 -0
- openstat/commands/manova_cmds.py +105 -0
- openstat/commands/mediate_cmds.py +233 -0
- openstat/commands/meta_cmds.py +284 -0
- openstat/commands/mi_cmds.py +228 -0
- openstat/commands/mixed_cmds.py +79 -0
- openstat/commands/mixture_changepoint_cmds.py +166 -0
- openstat/commands/ml_adv_cmds.py +147 -0
- openstat/commands/ml_cmds.py +178 -0
- openstat/commands/model_eval_cmds.py +142 -0
- openstat/commands/network_cmds.py +288 -0
- openstat/commands/nlquery_cmds.py +161 -0
- openstat/commands/nonparam_cmds.py +149 -0
- openstat/commands/outreg_cmds.py +247 -0
- openstat/commands/panel_cmds.py +141 -0
- openstat/commands/pdf_cmds.py +226 -0
- openstat/commands/pipeline_cmds.py +319 -0
- openstat/commands/plot_cmds.py +189 -0
- openstat/commands/plugin_cmds.py +79 -0
- openstat/commands/posthoc_cmds.py +153 -0
- openstat/commands/power_cmds.py +172 -0
- openstat/commands/profile_cmds.py +246 -0
- openstat/commands/rbridge_cmds.py +81 -0
- openstat/commands/regex_cmds.py +104 -0
- openstat/commands/report_cmds.py +48 -0
- openstat/commands/repro_cmds.py +129 -0
- openstat/commands/resampling_cmds.py +109 -0
- openstat/commands/reshape_cmds.py +223 -0
- openstat/commands/sem_cmds.py +177 -0
- openstat/commands/stat_cmds.py +1040 -0
- openstat/commands/stata_import_cmds.py +215 -0
- openstat/commands/string_cmds.py +124 -0
- openstat/commands/surv_cmds.py +145 -0
- openstat/commands/survey_cmds.py +153 -0
- openstat/commands/textanalysis_cmds.py +192 -0
- openstat/commands/ts_adv_cmds.py +136 -0
- openstat/commands/ts_cmds.py +195 -0
- openstat/commands/tui_cmds.py +111 -0
- openstat/commands/ux_cmds.py +191 -0
- openstat/commands/validate_cmds.py +270 -0
- openstat/commands/viz_adv_cmds.py +312 -0
- openstat/commands/viz_extra_cmds.py +251 -0
- openstat/commands/watch_cmds.py +69 -0
- openstat/config.py +106 -0
- openstat/dsl/__init__.py +0 -0
- openstat/dsl/parser.py +332 -0
- openstat/dsl/tokenizer.py +105 -0
- openstat/i18n.py +120 -0
- openstat/io/__init__.py +0 -0
- openstat/io/loader.py +187 -0
- openstat/jupyter/__init__.py +18 -0
- openstat/jupyter/display.py +18 -0
- openstat/jupyter/magic.py +60 -0
- openstat/logging_config.py +59 -0
- openstat/plots/__init__.py +0 -0
- openstat/plots/plotter.py +437 -0
- openstat/plots/surv_plots.py +32 -0
- openstat/plots/ts_plots.py +59 -0
- openstat/plugins/__init__.py +5 -0
- openstat/plugins/manager.py +69 -0
- openstat/repl.py +457 -0
- openstat/reporting/__init__.py +0 -0
- openstat/reporting/eda.py +208 -0
- openstat/reporting/report.py +67 -0
- openstat/script_runner.py +319 -0
- openstat/session.py +133 -0
- openstat/stats/__init__.py +0 -0
- openstat/stats/advanced_regression.py +269 -0
- openstat/stats/arch_garch.py +84 -0
- openstat/stats/bayesian.py +103 -0
- openstat/stats/causal.py +258 -0
- openstat/stats/clustering.py +206 -0
- openstat/stats/discrete.py +311 -0
- openstat/stats/epidemiology.py +119 -0
- openstat/stats/equiv_tobit.py +163 -0
- openstat/stats/factor.py +174 -0
- openstat/stats/imputation.py +282 -0
- openstat/stats/influence.py +78 -0
- openstat/stats/iv.py +131 -0
- openstat/stats/manova.py +124 -0
- openstat/stats/mixed.py +128 -0
- openstat/stats/ml.py +275 -0
- openstat/stats/ml_advanced.py +117 -0
- openstat/stats/model_eval.py +183 -0
- openstat/stats/models.py +1342 -0
- openstat/stats/nonparametric.py +130 -0
- openstat/stats/panel.py +179 -0
- openstat/stats/power.py +295 -0
- openstat/stats/resampling.py +203 -0
- openstat/stats/survey.py +213 -0
- openstat/stats/survival.py +196 -0
- openstat/stats/timeseries.py +142 -0
- openstat/stats/ts_advanced.py +114 -0
- openstat/types.py +11 -0
- openstat/web/__init__.py +1 -0
- openstat/web/app.py +117 -0
- openstat/web/session_manager.py +73 -0
- openstat/web/static/app.js +117 -0
- openstat/web/static/index.html +38 -0
- openstat/web/static/style.css +103 -0
- openstat_cli-1.0.0.dist-info/METADATA +748 -0
- openstat_cli-1.0.0.dist-info/RECORD +143 -0
- openstat_cli-1.0.0.dist-info/WHEEL +4 -0
- openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
- openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""Model evaluation: ROC/AUC, confusion matrix, calibration, SHAP approximation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import polars as pl
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def roc_auc(
    df: pl.DataFrame,
    outcome: str,
    score: str,
) -> dict:
    """Compute the ROC curve and AUC (trapezoidal rule) for a binary outcome.

    Sweeps every distinct score value as a classification threshold,
    accumulates (FPR, TPR) points, integrates the curve with the
    trapezoidal rule, and reports the Youden-J optimal threshold.

    Args:
        df: Input data frame.
        outcome: Column holding the binary (0/1) true labels.
        score: Column holding the continuous prediction scores.

    Returns:
        dict with the AUC, ROC point lists, thresholds, optimal threshold,
        number of observations, and outcome prevalence.
    """
    sub = df.select([outcome, score]).drop_nulls()
    y_true = sub[outcome].to_numpy().astype(int)
    y_score = sub[score].to_numpy().astype(float)

    # Descending thresholds so the ROC curve runs from (0, 0) to (1, 1).
    thresholds = np.sort(np.unique(y_score))[::-1]
    tpr_list = []
    fpr_list = []
    for t in thresholds:
        y_pred = (y_score >= t).astype(int)
        tp = int(((y_pred == 1) & (y_true == 1)).sum())
        fp = int(((y_pred == 1) & (y_true == 0)).sum())
        fn = int(((y_pred == 0) & (y_true == 1)).sum())
        tn = int(((y_pred == 0) & (y_true == 0)).sum())
        # max(..., 1) guards division by zero when one class is absent.
        tpr_list.append(tp / max(tp + fn, 1))
        fpr_list.append(fp / max(fp + tn, 1))

    # Anchor the curve at both corners before integrating.
    fpr_arr = np.array([0.0] + fpr_list + [1.0])
    tpr_arr = np.array([0.0] + tpr_list + [1.0])
    auc = float(np.trapezoid(tpr_arr, fpr_arr))

    # Youden J statistic -> optimal threshold.  fpr_arr/tpr_arr carry two
    # synthetic anchor points, so the matching thresholds index is
    # opt_idx - 1, clamped into range.  (The original indexing could raise
    # IndexError when J peaked at the appended (1, 1) anchor, and on an
    # empty thresholds array.)
    j = tpr_arr - fpr_arr
    opt_idx = int(np.argmax(j))
    if thresholds.size == 0:
        opt_threshold = float("nan")
    else:
        clamped = min(max(opt_idx - 1, 0), thresholds.size - 1)
        opt_threshold = float(thresholds[clamped])

    return {
        "test": "ROC / AUC",
        "outcome": outcome,
        "score": score,
        "auc": auc,
        "fpr": fpr_arr.tolist(),
        "tpr": tpr_arr.tolist(),
        "thresholds": thresholds.tolist(),
        "optimal_threshold": opt_threshold,
        "n_obs": len(y_true),
        "prevalence": float(y_true.mean()),
    }
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def confusion_matrix(
    df: pl.DataFrame,
    outcome: str,
    predicted: str,
    threshold: float = 0.5,
) -> dict:
    """Compute a confusion matrix plus standard classification metrics.

    Args:
        df: Input data frame.
        outcome: Column holding the binary (0/1) true labels.
        predicted: Column holding predictions (probabilities or 0/1 labels).
        threshold: Cutoff applied to continuous predictions (ignored when
            the predicted column is already binary).

    Returns:
        dict with the four cell counts and accuracy, precision,
        recall/sensitivity, specificity, F1, NPV, and MCC.
    """
    sub = df.select([outcome, predicted]).drop_nulls()
    y_true = sub[outcome].to_numpy().astype(int)
    y_score = sub[predicted].to_numpy().astype(float)

    # Already-binary predictions are taken as-is; otherwise threshold them.
    is_binary = set(np.unique(y_score)).issubset({0, 1, 0.0, 1.0})
    y_pred = y_score.astype(int) if is_binary else (y_score >= threshold).astype(int)

    tp = int(np.sum((y_true == 1) & (y_pred == 1)))
    fp = int(np.sum((y_true == 0) & (y_pred == 1)))
    fn = int(np.sum((y_true == 1) & (y_pred == 0)))
    tn = int(np.sum((y_true == 0) & (y_pred == 0)))

    total = tp + tn + fp + fn
    # max(..., 1) / small epsilon denominators avoid division by zero on
    # degenerate inputs (empty data or an absent class).
    accuracy = (tp + tn) / max(total, 1)
    precision = tp / max(tp + fp, 1)
    recall = tp / max(tp + fn, 1)
    specificity = tn / max(tn + fp, 1)
    f1 = 2 * precision * recall / max(precision + recall, 1e-10)
    npv = tn / max(tn + fn, 1)
    mcc_num = tp * tn - fp * fn
    mcc_den = np.sqrt(max((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn), 1))
    mcc = float(mcc_num / mcc_den)

    return {
        "test": "Confusion Matrix",
        "outcome": outcome, "predicted": predicted,
        "threshold": threshold,
        "tp": tp, "fp": fp, "fn": fn, "tn": tn,
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall_sensitivity": float(recall),
        "specificity": float(specificity),
        "f1_score": float(f1),
        "npv": float(npv),
        "mcc": mcc,
        "n_obs": len(y_true),
    }
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def calibration_curve(
    df: pl.DataFrame,
    outcome: str,
    score: str,
    n_bins: int = 10,
) -> dict:
    """Calibration curve (reliability diagram) + Brier score.

    Bins the predicted probabilities into `n_bins` equal-width bins over
    [0, 1] and compares the mean prediction in each bin against the
    observed fraction of positives.

    Args:
        df: Input data frame.
        outcome: Column holding the binary (0/1) true labels.
        score: Column holding predicted probabilities in [0, 1].
        n_bins: Number of equal-width probability bins.

    Returns:
        dict with the Brier score, per-bin centers, mean predictions,
        and observed positive fractions (empty bins are skipped).
    """
    sub = df.select([outcome, score]).drop_nulls()
    y_true = sub[outcome].to_numpy().astype(int)
    y_score = sub[score].to_numpy().astype(float)

    # Brier score: mean squared error between probability and outcome.
    brier = float(np.mean((y_score - y_true) ** 2))

    bins = np.linspace(0, 1, n_bins + 1)
    bin_centers = []
    mean_predicted = []
    fraction_positive = []

    for i in range(n_bins):
        mask = (y_score >= bins[i]) & (y_score < bins[i + 1])
        if i == n_bins - 1:
            # Last bin is closed on the right so scores of exactly 1.0
            # are not dropped.
            mask = (y_score >= bins[i]) & (y_score <= bins[i + 1])
        if mask.sum() > 0:
            bin_centers.append(float((bins[i] + bins[i + 1]) / 2))
            mean_predicted.append(float(y_score[mask].mean()))
            fraction_positive.append(float(y_true[mask].mean()))

    # NOTE: a Hosmer-Lemeshow "expected" array was previously computed here
    # but never used or returned, and its right-open last bin disagreed
    # with the loop above (closed last bin) — with scores of exactly 1.0 in
    # an otherwise-empty last bin the two arrays had mismatched lengths and
    # the multiplication raised ValueError.  The dead computation is removed.

    return {
        "test": "Calibration Curve",
        "outcome": outcome, "score": score,
        "brier_score": brier,
        "n_bins": n_bins,
        "bin_centers": bin_centers,
        "mean_predicted": mean_predicted,
        "fraction_positive": fraction_positive,
        "n_obs": len(y_true),
    }
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def compute_shap_linear(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
) -> dict:
    """
    Linear SHAP values for OLS regression.

    For a linear model the Shapley attribution has a closed form,
    SHAP_i(x) = beta_i * (x_i - E[x_i]), so no sampling approximation
    is needed — the values are exact.
    """
    sub = df.select([dep] + indeps).drop_nulls()
    y = sub[dep].to_numpy().astype(float)
    X_raw = sub.select(indeps).to_numpy().astype(float)
    n = X_raw.shape[0]

    # OLS fit with an explicit intercept column.
    design = np.column_stack([np.ones(n), X_raw])
    beta = np.linalg.lstsq(design, y, rcond=None)[0]

    # Per-observation, per-feature attributions relative to the feature mean.
    centered = X_raw - X_raw.mean(axis=0)
    shap_vals = centered * beta[1:]  # shape (n, k)

    importance = {
        name: float(np.abs(shap_vals[:, idx]).mean())
        for idx, name in enumerate(indeps)
    }
    ranking = [name for name, _ in sorted(importance.items(), key=lambda kv: -kv[1])]

    return {
        "method": "Linear SHAP",
        "dep": dep,
        "indeps": indeps,
        "n_obs": n,
        "mean_abs_shap": importance,
        "feature_ranking": ranking,
        "shap_values": shap_vals.tolist(),
        "coefficients": {name: float(beta[idx + 1]) for idx, name in enumerate(indeps)},
        "intercept": float(beta[0]),
    }
|