openstat-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143) hide show
  1. openstat/__init__.py +3 -0
  2. openstat/__main__.py +4 -0
  3. openstat/backends/__init__.py +16 -0
  4. openstat/backends/duckdb_backend.py +70 -0
  5. openstat/backends/polars_backend.py +52 -0
  6. openstat/cli.py +92 -0
  7. openstat/commands/__init__.py +82 -0
  8. openstat/commands/adv_stat_cmds.py +1255 -0
  9. openstat/commands/advanced_ml_cmds.py +576 -0
  10. openstat/commands/advreg_cmds.py +207 -0
  11. openstat/commands/alias_cmds.py +135 -0
  12. openstat/commands/arch_cmds.py +82 -0
  13. openstat/commands/arules_cmds.py +111 -0
  14. openstat/commands/automodel_cmds.py +212 -0
  15. openstat/commands/backend_cmds.py +82 -0
  16. openstat/commands/base.py +170 -0
  17. openstat/commands/bayes_cmds.py +71 -0
  18. openstat/commands/causal_cmds.py +269 -0
  19. openstat/commands/cluster_cmds.py +152 -0
  20. openstat/commands/data_cmds.py +996 -0
  21. openstat/commands/datamanip_cmds.py +672 -0
  22. openstat/commands/dataquality_cmds.py +174 -0
  23. openstat/commands/datetime_cmds.py +176 -0
  24. openstat/commands/dimreduce_cmds.py +184 -0
  25. openstat/commands/discrete_cmds.py +149 -0
  26. openstat/commands/dsl_cmds.py +143 -0
  27. openstat/commands/epi_cmds.py +93 -0
  28. openstat/commands/equiv_tobit_cmds.py +94 -0
  29. openstat/commands/esttab_cmds.py +196 -0
  30. openstat/commands/export_beamer_cmds.py +142 -0
  31. openstat/commands/export_cmds.py +201 -0
  32. openstat/commands/export_extra_cmds.py +240 -0
  33. openstat/commands/factor_cmds.py +180 -0
  34. openstat/commands/groupby_cmds.py +155 -0
  35. openstat/commands/help_cmds.py +237 -0
  36. openstat/commands/i18n_cmds.py +43 -0
  37. openstat/commands/import_extra_cmds.py +561 -0
  38. openstat/commands/influence_cmds.py +134 -0
  39. openstat/commands/iv_cmds.py +106 -0
  40. openstat/commands/manova_cmds.py +105 -0
  41. openstat/commands/mediate_cmds.py +233 -0
  42. openstat/commands/meta_cmds.py +284 -0
  43. openstat/commands/mi_cmds.py +228 -0
  44. openstat/commands/mixed_cmds.py +79 -0
  45. openstat/commands/mixture_changepoint_cmds.py +166 -0
  46. openstat/commands/ml_adv_cmds.py +147 -0
  47. openstat/commands/ml_cmds.py +178 -0
  48. openstat/commands/model_eval_cmds.py +142 -0
  49. openstat/commands/network_cmds.py +288 -0
  50. openstat/commands/nlquery_cmds.py +161 -0
  51. openstat/commands/nonparam_cmds.py +149 -0
  52. openstat/commands/outreg_cmds.py +247 -0
  53. openstat/commands/panel_cmds.py +141 -0
  54. openstat/commands/pdf_cmds.py +226 -0
  55. openstat/commands/pipeline_cmds.py +319 -0
  56. openstat/commands/plot_cmds.py +189 -0
  57. openstat/commands/plugin_cmds.py +79 -0
  58. openstat/commands/posthoc_cmds.py +153 -0
  59. openstat/commands/power_cmds.py +172 -0
  60. openstat/commands/profile_cmds.py +246 -0
  61. openstat/commands/rbridge_cmds.py +81 -0
  62. openstat/commands/regex_cmds.py +104 -0
  63. openstat/commands/report_cmds.py +48 -0
  64. openstat/commands/repro_cmds.py +129 -0
  65. openstat/commands/resampling_cmds.py +109 -0
  66. openstat/commands/reshape_cmds.py +223 -0
  67. openstat/commands/sem_cmds.py +177 -0
  68. openstat/commands/stat_cmds.py +1040 -0
  69. openstat/commands/stata_import_cmds.py +215 -0
  70. openstat/commands/string_cmds.py +124 -0
  71. openstat/commands/surv_cmds.py +145 -0
  72. openstat/commands/survey_cmds.py +153 -0
  73. openstat/commands/textanalysis_cmds.py +192 -0
  74. openstat/commands/ts_adv_cmds.py +136 -0
  75. openstat/commands/ts_cmds.py +195 -0
  76. openstat/commands/tui_cmds.py +111 -0
  77. openstat/commands/ux_cmds.py +191 -0
  78. openstat/commands/validate_cmds.py +270 -0
  79. openstat/commands/viz_adv_cmds.py +312 -0
  80. openstat/commands/viz_extra_cmds.py +251 -0
  81. openstat/commands/watch_cmds.py +69 -0
  82. openstat/config.py +106 -0
  83. openstat/dsl/__init__.py +0 -0
  84. openstat/dsl/parser.py +332 -0
  85. openstat/dsl/tokenizer.py +105 -0
  86. openstat/i18n.py +120 -0
  87. openstat/io/__init__.py +0 -0
  88. openstat/io/loader.py +187 -0
  89. openstat/jupyter/__init__.py +18 -0
  90. openstat/jupyter/display.py +18 -0
  91. openstat/jupyter/magic.py +60 -0
  92. openstat/logging_config.py +59 -0
  93. openstat/plots/__init__.py +0 -0
  94. openstat/plots/plotter.py +437 -0
  95. openstat/plots/surv_plots.py +32 -0
  96. openstat/plots/ts_plots.py +59 -0
  97. openstat/plugins/__init__.py +5 -0
  98. openstat/plugins/manager.py +69 -0
  99. openstat/repl.py +457 -0
  100. openstat/reporting/__init__.py +0 -0
  101. openstat/reporting/eda.py +208 -0
  102. openstat/reporting/report.py +67 -0
  103. openstat/script_runner.py +319 -0
  104. openstat/session.py +133 -0
  105. openstat/stats/__init__.py +0 -0
  106. openstat/stats/advanced_regression.py +269 -0
  107. openstat/stats/arch_garch.py +84 -0
  108. openstat/stats/bayesian.py +103 -0
  109. openstat/stats/causal.py +258 -0
  110. openstat/stats/clustering.py +206 -0
  111. openstat/stats/discrete.py +311 -0
  112. openstat/stats/epidemiology.py +119 -0
  113. openstat/stats/equiv_tobit.py +163 -0
  114. openstat/stats/factor.py +174 -0
  115. openstat/stats/imputation.py +282 -0
  116. openstat/stats/influence.py +78 -0
  117. openstat/stats/iv.py +131 -0
  118. openstat/stats/manova.py +124 -0
  119. openstat/stats/mixed.py +128 -0
  120. openstat/stats/ml.py +275 -0
  121. openstat/stats/ml_advanced.py +117 -0
  122. openstat/stats/model_eval.py +183 -0
  123. openstat/stats/models.py +1342 -0
  124. openstat/stats/nonparametric.py +130 -0
  125. openstat/stats/panel.py +179 -0
  126. openstat/stats/power.py +295 -0
  127. openstat/stats/resampling.py +203 -0
  128. openstat/stats/survey.py +213 -0
  129. openstat/stats/survival.py +196 -0
  130. openstat/stats/timeseries.py +142 -0
  131. openstat/stats/ts_advanced.py +114 -0
  132. openstat/types.py +11 -0
  133. openstat/web/__init__.py +1 -0
  134. openstat/web/app.py +117 -0
  135. openstat/web/session_manager.py +73 -0
  136. openstat/web/static/app.js +117 -0
  137. openstat/web/static/index.html +38 -0
  138. openstat/web/static/style.css +103 -0
  139. openstat_cli-1.0.0.dist-info/METADATA +748 -0
  140. openstat_cli-1.0.0.dist-info/RECORD +143 -0
  141. openstat_cli-1.0.0.dist-info/WHEEL +4 -0
  142. openstat_cli-1.0.0.dist-info/entry_points.txt +2 -0
  143. openstat_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,258 @@
1
+ """Causal inference models: Difference-in-Differences, Propensity Score Matching."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import polars as pl
7
+ import statsmodels.api as sm
8
+ from scipy import stats as sp_stats
9
+ from scipy.spatial import KDTree
10
+
11
+ from openstat.stats.models import FitResult, _model_type_suffix
12
+
13
+
14
+ # ── Difference-in-Differences ────────────────────────────────────────
15
+
16
def fit_did(
    df: pl.DataFrame,
    dep: str,
    treatment_col: str,
    time_col: str,
    *,
    robust: bool = False,
    cluster_col: str | None = None,
) -> tuple[FitResult, object]:
    """Fit a Difference-in-Differences model.

    Model: y = b0 + b1*treatment + b2*post + b3*(treatment*post) + e
    The DiD estimate is b3.

    Args:
        df: Input data.
        dep: Dependent (outcome) variable column.
        treatment_col: Binary (0/1) treatment-group indicator column.
        time_col: Binary (0/1) post-period indicator column.
        robust: Use HC1 heteroskedasticity-robust standard errors.
        cluster_col: Cluster standard errors on this column. Ignored when
            ``robust`` is also set (a note is appended to the warnings).

    Returns:
        Tuple of (FitResult summary, fitted statsmodels OLS results object).

    Raises:
        ValueError: If columns are missing, no complete observations remain,
            or the treatment/time indicators are not coded 0/1.
    """
    cols_needed = [dep, treatment_col, time_col]
    if cluster_col:
        cols_needed.append(cluster_col)
    cols_needed = list(dict.fromkeys(cols_needed))  # de-duplicate, keep order

    missing = [c for c in cols_needed if c not in df.columns]
    if missing:
        raise ValueError(f"Columns not found: {', '.join(missing)}")

    sub = df.select(cols_needed).drop_nulls()
    if sub.height == 0:
        raise ValueError("No observations after dropping missing values")

    warnings_list: list[str] = []
    n_dropped = df.height - sub.height
    if n_dropped > 0:
        warnings_list.append(f"Note: {n_dropped} observation(s) dropped due to missing values.")

    y = sub[dep].to_numpy().astype(float)
    treat = sub[treatment_col].to_numpy().astype(float)
    post = sub[time_col].to_numpy().astype(float)

    # Validate 0/1 coding (consistent with fit_psm): the b3 interaction
    # estimate and the group-mean diagnostics below assume binary dummies.
    for col_name, arr in ((treatment_col, treat), (time_col, post)):
        vals = set(np.unique(arr))
        if not vals.issubset({0.0, 1.0}):
            raise ValueError(
                f"Column '{col_name}' must be binary (0/1). Found: {sorted(vals)[:10]}"
            )

    interact = treat * post

    # Design matrix: intercept, treatment dummy, post dummy, interaction.
    X = np.column_stack([np.ones(len(y)), treat, post, interact])
    var_names = ["_cons", treatment_col, time_col, f"{treatment_col}:{time_col}"]

    # Cluster SE
    groups = None
    if cluster_col:
        groups = sub[cluster_col].to_numpy()

    if robust:
        cov_type = "HC1"
        cov_kwds: dict = {}
        if groups is not None:
            # Previously the cluster request was discarded silently when
            # robust was also set; surface the precedence to the user.
            warnings_list.append(
                "Note: both robust and cluster SEs requested; using HC1 robust SEs."
            )
    elif groups is not None:
        cov_type = "cluster"
        cov_kwds = {"groups": groups}
    else:
        cov_type = "nonrobust"
        cov_kwds = {}

    model = sm.OLS(y, X).fit(cov_type=cov_type, cov_kwds=cov_kwds)
    ci = model.conf_int()

    # Compute group means for diagnostics (the classic 2x2 DiD table).
    treat_mask = treat == 1
    control_mask = treat == 0
    pre_mask = post == 0
    post_mask = post == 1

    means = {}
    for label, t_mask, p_mask in [
        ("Control, Pre", control_mask, pre_mask),
        ("Control, Post", control_mask, post_mask),
        ("Treatment, Pre", treat_mask, pre_mask),
        ("Treatment, Post", treat_mask, post_mask),
    ]:
        mask = t_mask & p_mask
        if mask.sum() > 0:
            means[label] = float(np.mean(y[mask]))

    if means:
        warnings_list.append("Group means:")
        for label, mean in means.items():
            warnings_list.append(f" {label}: {mean:.4f}")

    # The DiD estimate is the interaction coefficient (column index 3).
    did_coef = float(model.params[3])
    did_se = float(model.bse[3])
    did_p = float(model.pvalues[3])
    warnings_list.append(f"DiD estimate: {did_coef:.4f} (SE={did_se:.4f}, p={did_p:.4f})")

    suffix = _model_type_suffix(robust, groups is not None)

    fit = FitResult(
        model_type="DiD" + suffix,
        formula=f"{dep} ~ {treatment_col} + {time_col} + {treatment_col}:{time_col}",
        dep_var=dep,
        indep_vars=[treatment_col, time_col],
        n_obs=int(model.nobs),
        params=dict(zip(var_names, model.params)),
        std_errors=dict(zip(var_names, model.bse)),
        t_values=dict(zip(var_names, model.tvalues)),
        p_values=dict(zip(var_names, model.pvalues)),
        conf_int_low=dict(zip(var_names, ci[:, 0])),
        conf_int_high=dict(zip(var_names, ci[:, 1])),
        r_squared=float(model.rsquared),
        adj_r_squared=float(model.rsquared_adj),
        f_statistic=float(model.fvalue) if model.fvalue is not None else None,
        f_pvalue=float(model.f_pvalue) if model.f_pvalue is not None else None,
        warnings=warnings_list,
    )
    return fit, model
122
+
123
+
124
+ # ── Propensity Score Matching ────────────────────────────────────────
125
+
126
def fit_psm(
    df: pl.DataFrame,
    outcome: str,
    covariates: list[str],
    treatment_col: str,
    *,
    n_neighbors: int = 1,
    caliper: float | None = None,
) -> str:
    """Propensity Score Matching: estimate Average Treatment Effect on Treated (ATT).

    Steps:
    1. Logit model for propensity score P(T=1 | X)
    2. KDTree nearest-neighbor matching
    3. ATT = mean(Y_treated - Y_matched_control)
    4. Bootstrap SE

    Args:
        df: Input data.
        outcome: Outcome variable column.
        covariates: Covariate columns used in the propensity model.
        treatment_col: Binary (0/1) treatment indicator column.
        n_neighbors: Number of nearest control neighbors per treated unit.
        caliper: Maximum propensity-score distance for a valid match.
            Defaults to 0.2 * SD of the estimated propensity scores.

    Returns:
        A formatted, multi-line text report (counts, ATT, SE, balance table).

    Raises:
        ValueError: On missing columns, too few observations, a non-binary
            treatment, missing treated/control units, or too few matches.
    """
    # De-duplicated column list, preserving order.
    all_cols = list(dict.fromkeys([outcome, treatment_col] + covariates))
    missing = [c for c in all_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Columns not found: {', '.join(missing)}")

    # Complete-case analysis only.
    sub = df.select(all_cols).drop_nulls()
    if sub.height < 20:
        raise ValueError(f"Too few observations ({sub.height}) for propensity score matching.")

    y = sub[outcome].to_numpy().astype(float)
    treat = sub[treatment_col].to_numpy().astype(float)

    # The logit and the 0/1 masks below require strictly binary treatment.
    unique_t = set(treat)
    if not unique_t.issubset({0.0, 1.0}):
        raise ValueError(
            f"Treatment variable must be binary (0/1). Found: {sorted(unique_t)[:10]}"
        )

    X = sub.select(covariates).to_numpy().astype(float)
    X_with_const = sm.add_constant(X)

    # Step 1: Propensity score via logit
    logit_model = sm.Logit(treat, X_with_const).fit(disp=0)
    pscore = logit_model.predict(X_with_const)

    # Default caliper: 0.2 SD of the propensity score.
    # NOTE(review): the common rule of thumb is 0.2 SD of the *logit* of the
    # propensity score — confirm which scale was intended here.
    if caliper is None:
        caliper = 0.2 * np.std(pscore)

    # Step 2: KDTree matching (1-D tree over control-unit propensity scores)
    treated_idx = np.where(treat == 1)[0]
    control_idx = np.where(treat == 0)[0]

    if len(treated_idx) == 0 or len(control_idx) == 0:
        raise ValueError("Need both treated and control observations.")

    control_ps = pscore[control_idx].reshape(-1, 1)
    tree = KDTree(control_ps)

    matched_treated = []            # outcomes of matched treated units
    matched_control_outcomes = []   # mean outcome of each unit's matched controls
    unmatched = 0                   # treated units with no control inside the caliper

    for t_i in treated_idx:
        ps_t = pscore[t_i]
        # Matching is with replacement: each treated unit queries the full tree.
        dists, idxs = tree.query([[ps_t]], k=n_neighbors)
        dists = dists.flatten()
        idxs = idxs.flatten()

        # Apply caliper: discard neighbors farther than the allowed distance.
        valid = dists <= caliper
        if not valid.any():
            unmatched += 1
            continue

        matched_treated.append(y[t_i])
        # Average the outcomes of all in-caliper neighbors for this unit.
        control_outcomes = [y[control_idx[idx]] for idx, v in zip(idxs, valid) if v]
        matched_control_outcomes.append(np.mean(control_outcomes))

    if len(matched_treated) < 5:
        raise ValueError(
            f"Only {len(matched_treated)} treated units matched. "
            f"Try increasing caliper or reducing n_neighbors."
        )

    matched_treated = np.array(matched_treated)
    matched_control_outcomes = np.array(matched_control_outcomes)

    # Step 3: ATT — mean within-pair outcome difference.
    att = float(np.mean(matched_treated - matched_control_outcomes))

    # Step 4: Bootstrap SE — resample matched pairs with a fixed seed so the
    # report is reproducible across runs.
    n_boot = 50
    rng = np.random.RandomState(42)
    boot_atts = []
    for _ in range(n_boot):
        boot_idx = rng.choice(len(matched_treated), size=len(matched_treated), replace=True)
        boot_att = float(np.mean(matched_treated[boot_idx] - matched_control_outcomes[boot_idx]))
        boot_atts.append(boot_att)

    se_att = float(np.std(boot_atts, ddof=1))
    # Normal-approximation two-sided p-value; NaN when SE is degenerate.
    t_stat = att / se_att if se_att > 0 else np.nan
    p_value = float(2 * (1 - sp_stats.norm.cdf(np.abs(t_stat))))

    # Balance table: mean difference before/after matching
    # (only the pre-matching comparison is actually computed below).
    balance_lines = []
    for i, cov in enumerate(covariates):
        mean_t = float(np.mean(X[treated_idx, i]))
        mean_c_all = float(np.mean(X[control_idx, i]))
        # Matched controls (approximate via pscore-matched indices)
        balance_lines.append(
            f" {cov:20s} Treated: {mean_t:8.4f} Control: {mean_c_all:8.4f} "
            f"Diff: {mean_t - mean_c_all:8.4f}"
        )

    lines = [
        "Propensity Score Matching",
        f" Treatment variable: {treatment_col}",
        f" Outcome variable: {outcome}",
        f" Covariates: {', '.join(covariates)}",
        f" Neighbors: {n_neighbors}, Caliper: {caliper:.4f}",
        "",
        f" N treated: {len(treated_idx)}",
        f" N control: {len(control_idx)}",
        f" Matched: {len(matched_treated)}",
        f" Unmatched: {unmatched}",
        "",
        f" ATT: {att:.4f}",
        f" SE: {se_att:.4f}",
        f" t-stat: {t_stat:.4f}",
        f" p-value: {p_value:.4f}",
        "",
        "Covariate Balance (before matching):",
    ] + balance_lines

    return "\n".join(lines)
@@ -0,0 +1,206 @@
1
+ """Clustering, MDS, and discriminant analysis."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+ import polars as pl
7
+
8
+ try:
9
+ from sklearn.cluster import KMeans, AgglomerativeClustering # type: ignore[import]
10
+ from sklearn.manifold import MDS # type: ignore[import]
11
+ from sklearn.discriminant_analysis import ( # type: ignore[import]
12
+ LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis,
13
+ )
14
+ from sklearn.preprocessing import StandardScaler, LabelEncoder # type: ignore[import]
15
+ from sklearn.metrics import ( # type: ignore[import]
16
+ silhouette_score, calinski_harabasz_score, accuracy_score,
17
+ )
18
+ _HAS_SKLEARN = True
19
+ except ImportError:
20
+ _HAS_SKLEARN = False
21
+
22
+
23
def _require_sklearn() -> None:
    """Fail fast with an install hint when scikit-learn is unavailable."""
    if _HAS_SKLEARN:
        return
    raise ImportError(
        "scikit-learn is required for clustering commands.\n"
        "Install: pip install scikit-learn"
    )
29
+
30
+
31
def _std(df: pl.DataFrame, cols: list[str]) -> tuple[np.ndarray, np.ndarray]:
    """Select *cols*, drop rows with nulls, and z-score standardize.

    Returns:
        (standardized, raw): two float arrays of shape (n_rows, len(cols)) —
        the z-scored matrix and the untransformed values.

    Note: the return annotation previously claimed a single ndarray although
    a 2-tuple was always returned; it is corrected here.  The redundant local
    re-import of StandardScaler (already imported at module top under the
    sklearn guard) is removed; callers invoke _require_sklearn() first.
    """
    X = df.select(cols).drop_nulls().to_numpy().astype(float)
    return StandardScaler().fit_transform(X), X
35
+
36
+
37
+ # ── K-Means ────────────────────────────────────────────────────────────────
38
+
39
def fit_kmeans(
    df: pl.DataFrame,
    cols: list[str],
    *,
    k: int = 3,
    n_init: int = 10,
    max_iter: int = 300,
    random_state: int = 42,
) -> dict:
    """K-means clustering."""
    _require_sklearn()
    scaled, raw = _std(df, cols)

    km = KMeans(n_clusters=k, n_init=n_init, max_iter=max_iter, random_state=random_state)
    assignments = km.fit_predict(scaled)

    # Internal validity indices are undefined for a single cluster.
    multi = k > 1
    sil = float(silhouette_score(scaled, assignments)) if multi else float("nan")
    ch = float(calinski_harabasz_score(scaled, assignments)) if multi else float("nan")

    sizes = {int(c): int((assignments == c).sum()) for c in range(k)}

    # Undo the z-scoring so centroids are reported on the original scale;
    # the epsilon guards against zero-variance columns.
    col_means = raw.mean(axis=0)
    col_stds = raw.std(axis=0) + 1e-15
    centers = km.cluster_centers_ * col_stds + col_means

    return {
        "method": "K-Means",
        "cols": cols,
        "k": k,
        "n_obs": len(scaled),
        "inertia": float(km.inertia_),
        "silhouette_score": sil,
        "calinski_harabasz": ch,
        "cluster_sizes": sizes,
        "centroids": centers.tolist(),
        "labels": assignments.tolist(),
        "_model": km,
    }
82
+
83
+
84
+ # ── Hierarchical (Agglomerative) ───────────────────────────────────────────
85
+
86
def fit_hierarchical(
    df: pl.DataFrame,
    cols: list[str],
    *,
    k: int = 3,
    linkage: str = "ward",
    metric: str = "euclidean",
) -> dict:
    """Agglomerative hierarchical clustering.

    Args:
        df: Input data.
        cols: Numeric columns to cluster on (standardized internally).
        k: Number of clusters to extract.
        linkage: Linkage criterion. Ward linkage requires Euclidean distance,
            so requesting ward with any other metric falls back to average.
        metric: Pairwise distance metric passed to the estimator.

    Returns:
        dict with fit diagnostics, cluster sizes, labels, and the model.
    """
    _require_sklearn()
    X_s, _ = _std(df, cols)
    n = len(X_s)

    # Ward linkage is only defined for Euclidean distance; fall back.
    link = linkage if linkage != "ward" or metric == "euclidean" else "average"
    # Bug fix: `metric` was accepted but never forwarded to the estimator,
    # so non-Euclidean metrics were silently ignored.
    model = AgglomerativeClustering(n_clusters=k, linkage=link, metric=metric)
    labels = model.fit_predict(X_s)

    sil = float(silhouette_score(X_s, labels)) if k > 1 else float("nan")
    ch = float(calinski_harabasz_score(X_s, labels)) if k > 1 else float("nan")
    cluster_sizes = {int(i): int((labels == i).sum()) for i in range(k)}

    return {
        "method": "Hierarchical",
        "cols": cols,
        "k": k,
        "linkage": linkage,
        "metric": metric,
        "n_obs": n,
        "silhouette_score": sil,
        "calinski_harabasz": ch,
        "cluster_sizes": cluster_sizes,
        "labels": labels.tolist(),
        "_model": model,
    }
119
+
120
+
121
+ # ── MDS ────────────────────────────────────────────────────────────────────
122
+
123
def fit_mds(
    df: pl.DataFrame,
    cols: list[str],
    *,
    n_components: int = 2,
    metric: bool = True,
    random_state: int = 42,
) -> dict:
    """Multidimensional Scaling."""
    _require_sklearn()
    scaled, _ = _std(df, cols)

    mds = MDS(
        n_components=n_components,
        metric=metric,
        random_state=random_state,
        normalized_stress="auto",
    )
    embedding = mds.fit_transform(scaled)

    return {
        "method": "MDS",
        "cols": cols,
        "n_components": n_components,
        "metric": metric,
        "stress": float(mds.stress_),
        "n_obs": len(scaled),
        "coordinates": embedding.tolist(),
        "_model": mds,
    }
154
+
155
+
156
+ # ── Discriminant Analysis ──────────────────────────────────────────────────
157
+
158
def fit_discriminant(
    df: pl.DataFrame,
    dep: str,
    indeps: list[str],
    *,
    method: str = "lda",
) -> dict:
    """Linear or Quadratic Discriminant Analysis.

    Args:
        df: Input data.
        dep: Class-label column (any dtype; label-encoded internally).
        indeps: Numeric predictor columns.
        method: "lda" (default) or "qda".

    Returns:
        dict with classes, priors, in-sample accuracy, the fitted model,
        the label encoder, and (LDA only) per-class coefficient mappings.
    """
    _require_sklearn()
    sub = df.select([dep] + indeps).drop_nulls()
    y_raw = sub[dep].to_numpy()
    X = sub.select(indeps).to_numpy().astype(float)

    # Encode labels as 0..k-1; cast to str so mixed dtypes encode stably.
    le = LabelEncoder()
    y = le.fit_transform(y_raw.astype(str))

    if method.lower() == "qda":
        model = QuadraticDiscriminantAnalysis()
    else:
        model = LinearDiscriminantAnalysis()

    model.fit(X, y)
    y_pred = model.predict(X)
    # In-sample (resubstitution) accuracy — optimistic by construction.
    acc = float(accuracy_score(y, y_pred))

    classes = le.classes_.tolist()
    prior = model.priors_.tolist() if hasattr(model, "priors_") else []

    result = {
        "method": method.upper(),
        "dep": dep,
        "indeps": indeps,
        "classes": classes,
        "n_classes": len(classes),
        "priors": prior,
        "accuracy": acc,
        "n_obs": len(y),
        "_model": model,
        "_le": le,
    }

    # LDA-specific: decision-function coefficients.
    # Bug fix: sklearn's LDA coef_ has one row per class in the multiclass
    # case (and a single row for binary problems) — not k-1 discriminant
    # functions. The previous code zipped rows against classes[1:], which
    # mislabeled and dropped rows for k > 2. Label rows by actual shape.
    if method.lower() == "lda" and hasattr(model, "coef_"):
        coef_rows = model.coef_
        row_labels = classes if len(coef_rows) == len(classes) else classes[1:]
        result["coefficients"] = {
            cls: dict(zip(indeps, coef_rows[i].tolist()))
            for i, cls in enumerate(row_labels)
        }

    return result