PyPI - marginaleffects - Versions diffs - 0.5.0__tar.gz → 0.5.1__tar.gz - Mend

marginaleffects 0.5.0.tar.gz → 0.5.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (116) hide show

{marginaleffects-0.5.0 → marginaleffects-0.5.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: marginaleffects
-Version: 0.5.0
+Version: 0.5.1
 Summary: Predictions, counterfactual comparisons, slopes, and hypothesis tests for statistical models.
 License-Expression: GPL-3.0-or-later
 Requires-Python: >=3.10

{marginaleffects-0.5.0 → marginaleffects-0.5.1}/marginaleffects/by.py RENAMED Viewed

@@ -1,5 +1,4 @@
 import polars as pl
-import numpy as np
 from typing import List, Optional, Tuple
@@ -66,8 +65,7 @@ def _get_by_internal(
     else:
         out = pl.DataFrame({"estimate": estimand["estimate"]})
-    by = [x for x in by if x in out.columns]
-    by = np.unique(by)
+    by = list(dict.fromkeys(x for x in by if x in out.columns))
     if isinstance(by, list) and len(by) == 0:
         if return_groups and "rowid" in out.columns:

{marginaleffects-0.5.0 → marginaleffects-0.5.1}/marginaleffects/comparisons.py RENAMED Viewed

@@ -1,4 +1,5 @@
 import re
+import warnings
 from functools import reduce
 import numpy as np
@@ -118,28 +119,22 @@ def _build_comparison_frames(newdata, variables, cross):
                 hi.append(hi_row)
                 lo.append(lo_row)
         else:
-            hi.append(newdata)
-            lo.append(newdata)
-            nd.append(newdata)
+            nd_row = newdata.clone()
+            hi_row = newdata.clone()
+            lo_row = newdata.clone()
             for v in variables:
                 vcomp = "custom" if callable(v.comparison) else v.comparison
-                nd[0] = nd[0].with_columns(
+                shared = [
                     pl.lit(v.variable).alias("term"),
                     pl.lit(v.lab).alias(f"contrast_{v.variable}"),
                     pl.lit(vcomp).alias("marginaleffects_comparison"),
-                )
-                hi[0] = hi[0].with_columns(
-                    pl.lit(v.hi).alias(v.variable),
-                    pl.lit(v.variable).alias("term"),
-                    pl.lit(v.lab).alias(f"contrast_{v.variable}"),
-                    pl.lit(vcomp).alias("marginaleffects_comparison"),
-                )
-                lo[0] = lo[0].with_columns(
-                    pl.lit(v.lo).alias(v.variable),
-                    pl.lit(v.variable).alias("term"),
-                    pl.lit(v.lab).alias(f"contrast_{v.variable}"),
-                    pl.lit(vcomp).alias("marginaleffects_comparison"),
-                )
+                ]
+                nd_row = nd_row.with_columns(*shared)
+                hi_row = hi_row.with_columns(pl.lit(v.hi).alias(v.variable), *shared)
+                lo_row = lo_row.with_columns(pl.lit(v.lo).alias(v.variable), *shared)
+            nd.append(nd_row)
+            hi.append(hi_row)
+            lo.append(lo_row)
     return nd, hi, lo
@@ -166,9 +161,10 @@ def _finalize_counterfactual_frames(
     pad_df = upcast(pad_df, hi)
     nd = upcast(nd, hi)
-    dfs_to_align = [("nd", nd), ("hi", hi), ("lo", lo)]
+    dfs = {"nd": nd, "hi": hi, "lo": lo}
-    for df_name, df in dfs_to_align:
+    for df_name in dfs:
+        df = dfs[df_name]
         common_cols = set(pad_df.columns) & set(df.columns)
         for col in common_cols:
             pad_dtype = str(pad_df[col].dtype)
@@ -189,8 +185,8 @@ def _finalize_counterfactual_frames(
                                 .alias(col)
                             )
                     except Exception as e:
-                        print(
-                            f"Warning: Could not convert List column {col} to strings: {e}"
+                        warnings.warn(
+                            f"Could not convert List column {col} to strings: {e}"
                         )
                         try:
                             if col in pad_df.columns and pad_df.height > 0:
@@ -198,7 +194,7 @@ def _finalize_counterfactual_frames(
                             if col in df.columns and df.height > 0:
                                 df = df.explode(col)
                         except Exception as e2:
-                            print(f"Warning: Could not explode List column {col}: {e2}")
+                            warnings.warn(f"Could not explode List column {col}: {e2}")
                             if col in pad_df.columns:
                                 pad_df = pad_df.with_columns(
                                     pad_df[col].cast(pl.String).alias(col)
@@ -206,12 +202,9 @@ def _finalize_counterfactual_frames(
                             if col in df.columns:
                                 df = df.with_columns(df[col].cast(pl.String).alias(col))
-        if df_name == "nd":
-            nd = df
-        elif df_name == "hi":
-            hi = df
-        elif df_name == "lo":
-            lo = df
+        dfs[df_name] = df
+    nd, hi, lo = dfs["nd"], dfs["hi"], dfs["lo"]
     nd = pl.concat([pad_df, nd], how="diagonal")
     hi = pl.concat([pad_df, hi], how="diagonal")
@@ -221,9 +214,7 @@ def _finalize_counterfactual_frames(
     categorical_list_cols = []
     for col in list_cols:
         dtype_str = str(nd[col].dtype)
-        if (
-            "Enum(" in dtype_str or "String" in dtype_str or "UInt32" in dtype_str
-        ) and col in ["Region"]:
+        if "Enum(" in dtype_str or "String" in dtype_str or "UInt32" in dtype_str:
             categorical_list_cols.append(col)
     if categorical_list_cols:
@@ -241,7 +232,7 @@ def _prepare_design_matrices(model, nd, hi, lo, pad_rows):
     lo_X = model.get_exog(lo)
     nd_X = model.get_exog(nd)
-    if pad_rows >= 0:
+    if pad_rows > 0:
         nd_X = nd_X[pad_rows:]
         hi_X = hi_X[pad_rows:]
         lo_X = lo_X[pad_rows:]

{marginaleffects-0.5.0 → marginaleffects-0.5.1}/marginaleffects/estimands.py RENAMED Viewed

@@ -42,12 +42,12 @@ estimands = {
     "ratio": lambda hi, lo, eps, x, y, w: prep(hi / lo),
     "ratioavg": lambda hi, lo, eps, x, y, w: prep(hi.mean() / lo.mean()),
     "ratioavgwts": lambda hi, lo, eps, x, y, w: prep(
-        (hi * w).sum() / w.sum() / (lo * w).sum() / w.sum()
+        ((hi * w).sum() / w.sum()) / ((lo * w).sum() / w.sum())
     ),
     "lnratio": lambda hi, lo, eps, x, y, w: prep(np.log(hi / lo)),
     "lnratioavg": lambda hi, lo, eps, x, y, w: prep(np.log(hi.mean() / lo.mean())),
     "lnratioavgwts": lambda hi, lo, eps, x, y, w: prep(
-        np.log((hi * w).sum() / w.sum() / (lo * w).sum() / w.sum())
+        np.log(((hi * w).sum() / w.sum()) / ((lo * w).sum() / w.sum()))
     ),
     "lnor": lambda hi, lo, eps, x, y, w: prep(
         np.log((hi / (1 - hi)) / (lo / (1 - lo)))
@@ -69,7 +69,7 @@ estimands = {
     "expdydxavg": lambda hi, lo, eps, x, y, w: prep(
         np.mean(((hi.exp() - lo.exp()) / np.exp(eps)) / eps)
     ),
-    "expdydxavgwts": lambda hi, lo, eps, x, y, w: (
-        prep((((np.exp(hi) - np.exp(lo)) / np.exp(eps)) / eps) * w).sum() / w.sum()
+    "expdydxavgwts": lambda hi, lo, eps, x, y, w: prep(
+        ((((np.exp(hi) - np.exp(lo)) / np.exp(eps)) / eps) * w).sum() / w.sum()
     ),
 }

{marginaleffects-0.5.0 → marginaleffects-0.5.1}/marginaleffects/plot/common.py RENAMED Viewed

@@ -1,3 +1,5 @@
+import copy
 import numpy as np
 from ..datagrid import datagrid  # noqa
 from ..sanitize import sanitize_model
@@ -7,7 +9,7 @@ import polars as pl
 def dt_on_condition(model, condition):
     model = sanitize_model(model)
-    condition_new = condition  # two pointers to the same object? this looks like a bug
+    condition_new = copy.deepcopy(condition)
     # not sure why newdata gets added
     modeldata = model.get_modeldata()
@@ -19,28 +21,24 @@ def dt_on_condition(model, condition):
     first_key = ""  # special case when the first element is numeric
     if isinstance(condition_new, list):
-        assert all(ele in modeldata.columns for ele in condition_new), (
-            "All elements of condition must be columns of the model."
-        )
+        if not all(ele in modeldata.columns for ele in condition_new):
+            raise ValueError("All elements of condition must be columns of the model.")
         first_key = condition_new[0]
         to_datagrid = {key: None for key in condition_new}
     elif isinstance(condition_new, dict):
-        assert all(key in modeldata.columns for key in condition_new.keys()), (
-            "All keys of condition must be columns of the model."
-        )
+        if not all(key in modeldata.columns for key in condition_new.keys()):
+            raise ValueError("All keys of condition must be columns of the model.")
         first_key = next(iter(condition_new))
-        to_datagrid = (
-            condition_new  # third pointer to the same object? looks like a BUG
-        )
+        to_datagrid = condition_new
-    # not sure why `newdata` sometimes gets added
     if isinstance(condition_new, dict) and "newdata" in to_datagrid.keys():
         condition_new.pop("newdata", None)
-    assert 1 <= len(condition_new) <= 4, (
-        f"Lenght of condition must be inclusively between 1 and 4. Got : {len(condition_new)}."
-    )
+    if not (1 <= len(condition_new) <= 4):
+        raise ValueError(
+            f"Length of condition must be inclusively between 1 and 4. Got: {len(condition_new)}."
+        )
     for key, value in to_datagrid.items():
         variable_type = model.get_variable_type(key)
@@ -51,20 +49,17 @@ def dt_on_condition(model, condition):
             )
         elif variable_type in ["character"]:
-            # get specified names of the condition
-            # here is the BUG, we take the values of "species" back from the model
             to_datagrid[key] = (
                 to_datagrid[key]
                 if to_datagrid[key]
                 else modeldata[key].unique().sort().to_list()
             )
-            assert len(to_datagrid[key]) <= 10, (
-                f"Character type variables of more than 10 unique values are not supported. {key} variable has {len(to_datagrid[key])} unique values."
-            )
+            if len(to_datagrid[key]) > 10:
+                raise ValueError(
+                    f"Character type variables of more than 10 unique values are not supported. {key} variable has {len(to_datagrid[key])} unique values."
+                )
         elif variable_type in ["boolean", "binary"]:
-            # get specified names of the condition
-            # here is the BUG, we take the values of "species" back from the model
             if to_datagrid[key] is None:
                 to_datagrid[key] = modeldata[key].unique().sort().to_list()
@@ -131,15 +126,14 @@ def ordered_cat(dt, k, lab):
 def validate_plot_args(condition, by, newdata, wts):
-    assert not (not by and newdata is not None), (
-        "The `newdata` argument requires a `by` argument."
-    )
-    assert not (wts is not None and not by), (
-        "The `wts` argument requires a `by` argument."
-    )
-    assert (condition is None and by) or (condition is not None and not by), (
-        "One of the `condition` and `by` arguments must be supplied, but not both."
-    )
+    if not by and newdata is not None:
+        raise ValueError("The `newdata` argument requires a `by` argument.")
+    if wts is not None and not by:
+        raise ValueError("The `wts` argument requires a `by` argument.")
+    if not ((condition is None and by) or (condition is not None and not by)):
+        raise ValueError(
+            "One of the `condition` and `by` arguments must be supplied, but not both."
+        )
 def extract_var_list(condition, by):
@@ -158,9 +152,10 @@ def extract_var_list(condition, by):
     var_list = [x for x in var_list if x not in ["newdata", "model"]]
-    assert len(var_list) < 5, (
-        "The `condition` and `by` arguments can have a max length of 4."
-    )
+    if len(var_list) >= 5:
+        raise ValueError(
+            "The `condition` and `by` arguments can have a max length of 4."
+        )
     return var_list
@@ -286,10 +281,10 @@ def plot_common(model, dt, y_label, var_list, gray=False, points=0):
         if len(var_list) > 1:
             if gray:
                 # get the number of unique values in the column "var_list[1]"
-                unique_values = dt[var_list[1]].unique().len()
-                if unique_values > 5:
+                unique_values = dt[var_list[1]].unique()
+                if unique_values.len() > 5:
                     raise ValueError(
-                        f"The number of elements in the second position of the `condition` or `by` argument (variable {var_list[1]}) cannot exceed 5. It has currently {len(unique_values)} elements, with values {unique_values}."
+                        f"The number of elements in the second position of the `condition` or `by` argument (variable {var_list[1]}) cannot exceed 5. It has currently {unique_values.len()} elements, with values {unique_values.to_list()}."
                     )
                 custom_line_types = [
                     "solid",

{marginaleffects-0.5.0 → marginaleffects-0.5.1}/marginaleffects/sanitize/comparison.py RENAMED Viewed

@@ -73,8 +73,7 @@ def sanitize_comparison(comparison, by, wts=None):
         "expdydx": "exp(dY/dX)",
     }
-    assert out in lab.keys(), (
-        f"`comparison` must be one of: {', '.join(list(lab.keys()))}."
-    )
+    if out not in lab.keys():
+        raise ValueError(f"`comparison` must be one of: {', '.join(list(lab.keys()))}.")
     return (out, lab[out])

{marginaleffects-0.5.0 → marginaleffects-0.5.1}/marginaleffects/sanitize/newdata.py RENAMED Viewed

@@ -1,12 +1,16 @@
 import numpy as np
 import polars as pl
-from ..datagrid import datagrid
-from ..utils import ingest, upcast
 from ..formula import listwise_deletion
 def sanitize_newdata(model, newdata, wts, by=[]):
+    # Lazy imports to break the `datagrid -> utils -> sanitize -> newdata -> ...`
+    # circular import that fires when `datagrid` is the first symbol pulled from
+    # marginaleffects in a fresh interpreter (see GH #1724).
+    from ..datagrid import datagrid
+    from ..utils import ingest, upcast
     modeldata = model.get_modeldata()
     if newdata is None:
@@ -72,9 +76,10 @@ def sanitize_newdata(model, newdata, wts, by=[]):
         "contrast",
         "statistic",
     }
-    assert not (set(out.columns) & reserved_names), (
-        f"Input data contain reserved column name(s) : {set(out.columns).intersection(reserved_names)}"
-    )
+    if set(out.columns) & reserved_names:
+        raise ValueError(
+            f"Input data contain reserved column name(s): {set(out.columns).intersection(reserved_names)}"
+        )
     datagrid_explicit = None
     if isinstance(newdata, pl.DataFrame) and hasattr(newdata, "datagrid_explicit"):

{marginaleffects-0.5.0 → marginaleffects-0.5.1}/marginaleffects/sanitize/variables.py RENAMED Viewed

@@ -11,17 +11,11 @@ HiLo = namedtuple("HiLo", ["variable", "hi", "lo", "lab", "pad", "comparison"])
 def _clean_global(k, n):
-    if (
-        not isinstance(k, list)
-        and not isinstance(k, pl.Series)
-        and not isinstance(k, np.ndarray)
-    ):
-        out = [k]
+    if isinstance(k, (pl.Series, np.ndarray)):
+        return pl.Series(k) if len(k) > 1 else pl.Series(np.repeat(k[0], n))
     if not isinstance(k, list) or len(k) == 1:
-        out = pl.Series(np.repeat(k, n))
-    else:
-        out = pl.Series(k)
-    return out
+        return pl.Series(np.repeat(k, n))
+    return pl.Series(k)
 def _get_one_variable_hi_lo(
@@ -153,9 +147,10 @@ def _get_one_variable_hi_lo(
         elif callable(value):
             tmp = value(newdata[variable])
-            assert tmp.shape[1] == 2, (
-                f"The function passed to `variables` must return a DataFrame with two columns. Got {tmp.shape[1]}."
-            )
+            if tmp.shape[1] != 2:
+                raise ValueError(
+                    f"The function passed to `variables` must return a DataFrame with two columns. Got {tmp.shape[1]}."
+                )
             lo = tmp[:, 0]
             hi = tmp[:, 1]
             lab = "custom"
@@ -225,9 +220,8 @@ def sanitize_variables(
             )
     elif isinstance(variables, dict):
-        for v in variables:
+        for v in list(variables.keys()):
             if v not in newdata.columns:
-                del variables[v]
                 warn(f"Variable {v} is not in newdata.")
             else:
                 out.append(

{marginaleffects-0.5.0 → marginaleffects-0.5.1}/marginaleffects/sanitize/vcov.py RENAMED Viewed

@@ -14,5 +14,6 @@ def sanitize_vcov(vcov, model):
     V = model.get_vcov(vcov)
     if V is not None:
-        assert isinstance(V, np.ndarray), "vcov must be True or a square NumPy array"
+        if not isinstance(V, np.ndarray):
+            raise TypeError("vcov must be True or a square NumPy array")
     return V

{marginaleffects-0.5.0 → marginaleffects-0.5.1}/marginaleffects/uncertainty.py RENAMED Viewed

@@ -6,36 +6,29 @@ import scipy.stats as stats
 def get_jacobian(func, coefs, eps_vcov=None):
-    # forward finite difference (faster)
+    original_shape = None
     if coefs.ndim == 2:
+        original_shape = coefs.shape
         if isinstance(coefs, np.ndarray):
             coefs_flat = coefs.flatten(order="F")
         else:
             coefs_flat = coefs.to_numpy().flatten(order="F")
-        baseline = func(coefs)["estimate"].to_numpy()
-        jac = np.empty((baseline.shape[0], len(coefs_flat)), dtype=np.float64)
-        for i, xi in enumerate(coefs_flat):
-            if eps_vcov is not None:
-                h = eps_vcov
-            else:
-                h = max(abs(xi) * np.sqrt(np.finfo(float).eps), 1e-10)
-            dx = np.copy(coefs_flat)
-            dx[i] = dx[i] + h
-            tmp = dx.reshape(coefs.shape, order="F")
-            jac[:, i] = (func(tmp)["estimate"].to_numpy() - baseline) / h
-        return jac
     else:
-        baseline = func(coefs)["estimate"].to_numpy()
-        jac = np.empty((baseline.shape[0], len(coefs)), dtype=np.float64)
-        for i, xi in enumerate(coefs):
-            if eps_vcov is not None:
-                h = eps_vcov
-            else:
-                h = max(abs(xi) * np.sqrt(np.finfo(float).eps), 1e-10)
-            dx = np.copy(coefs)
-            dx[i] = dx[i] + h
-            jac[:, i] = (func(dx)["estimate"].to_numpy() - baseline) / h
-        return jac
+        coefs_flat = np.asarray(coefs)
+    baseline = func(coefs)["estimate"].to_numpy()
+    jac = np.empty((baseline.shape[0], len(coefs_flat)), dtype=np.float64)
+    for i, xi in enumerate(coefs_flat):
+        if eps_vcov is not None:
+            h = eps_vcov
+        else:
+            h = max(abs(xi) * np.sqrt(np.finfo(float).eps), 1e-10)
+        dx = np.copy(coefs_flat)
+        dx[i] = dx[i] + h
+        if original_shape is not None:
+            dx = dx.reshape(original_shape, order="F")
+        jac[:, i] = (func(dx)["estimate"].to_numpy() - baseline) / h
+    return jac
 def get_se(J, V):
@@ -65,7 +58,7 @@ def get_z_p_ci(df, model, conf_level, hypothesis_null=0):
             "statistic"
         )
     )
-    if hasattr(model, "df_resid") and isinstance(model.df_resid, float):
+    if hasattr(model, "df_resid") and isinstance(model.df_resid, (int, float)):
         dof = model.df_resid
     else:
         dof = np.inf
@@ -93,6 +86,6 @@ def get_z_p_ci(df, model, conf_level, hypothesis_null=0):
                 .map_batches(lambda x: -np.log2(x), return_dtype=pl.Float64)
                 .alias("s_value")
             )
-        except Exception as e:
-            print(f"An exception occurred: {e}")
+        except Exception:
+            pass
     return df

{marginaleffects-0.5.0 → marginaleffects-0.5.1}/marginaleffects/utils.py RENAMED Viewed

@@ -116,7 +116,7 @@ def upcast(df, reference):
         pl.Float64,
     ]
     for col in df.columns:
-        if col in df.columns and col in reference.columns:
+        if col in reference.columns:
             good = reference[col].dtype
             bad = df[col].dtype
             if good != bad:

{marginaleffects-0.5.0 → marginaleffects-0.5.1}/marginaleffects.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: marginaleffects
-Version: 0.5.0
+Version: 0.5.1
 Summary: Predictions, counterfactual comparisons, slopes, and hypothesis tests for statistical models.
 License-Expression: GPL-3.0-or-later
 Requires-Python: >=3.10

{marginaleffects-0.5.0 → marginaleffects-0.5.1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "marginaleffects"
-version = "0.5.0"
+version = "0.5.1"
 license = "GPL-3.0-or-later"
 description = "Predictions, counterfactual comparisons, slopes, and hypothesis tests for statistical models."
 readme = "README.md"

{marginaleffects-0.5.0 → marginaleffects-0.5.1}/tests/test_bugfix.py RENAMED Viewed

@@ -1,3 +1,6 @@
+import subprocess
+import sys
 import numpy as np
 import pandas as pd
 import polars as pl
@@ -22,3 +25,18 @@ def test_issue_226_np_context():
     out = predictions(mod, newdata=df)
     assert isinstance(out, MarginaleffectsResult)
     assert isinstance(out.data, pl.DataFrame)
+def test_issue_1724():
+    # Circular import when `datagrid` is the first symbol pulled from
+    # marginaleffects in a fresh interpreter. Must run in a subprocess —
+    # the in-process pytest run has already warmed the import graph.
+    result = subprocess.run(
+        [sys.executable, "-c", "from marginaleffects import datagrid"],
+        capture_output=True,
+        text=True,
+    )
+    assert result.returncode == 0, (
+        f"Fresh-process import of `datagrid` failed.\n"
+        f"stdout: {result.stdout}\nstderr: {result.stderr}"
+    )

{marginaleffects-0.5.0 → marginaleffects-0.5.1}/tests/test_comparisons.py RENAMED Viewed

@@ -200,7 +200,7 @@ def test_lift():
     cmp2 = comparisons(mod, comparison="liftavg")
     assert cmp1.shape[0] == 32
     assert cmp2.shape[0] == 1
-    with pytest.raises(AssertionError):
+    with pytest.raises(ValueError):
         comparisons(mod, comparison="liftr")

{marginaleffects-0.5.0 → marginaleffects-0.5.1}/tests/test_pyfixest.py RENAMED Viewed

@@ -204,7 +204,7 @@ def test_pyfixest_standard_errors_across_models():
     fit_pois_fe = fepois("Y ~ X1 * X2 * Z1 | f1", data=poisson_data)
     with pytest.warns(
         UserWarning,
-        match="uncertainty in fixed-effects parameters when computing contrasts",
+        match="cannot take into account the uncertainty in fixed-effects",
     ):
         try:
             comp_pois_fe = comparisons(fit_pois_fe)