PyPI - tabmat - Versions diffs - 4.1.5__cp310-cp310-win_amd64.whl → 4.2.1__cp310-cp310-win_amd64.whl - Mend

tabmat 4.1.5__cp310-cp310-win_amd64.whl → 4.2.1__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

tabmat/benchmark/generate_matrices.py +1 -1
tabmat/categorical_matrix.py +9 -3
tabmat/constructor.py +1 -1
tabmat/ext/categorical.cp310-win_amd64.pyd +0 -0
tabmat/ext/dense.cp310-win_amd64.pyd +0 -0
tabmat/ext/sparse.cp310-win_amd64.pyd +0 -0
tabmat/ext/split.cp310-win_amd64.pyd +0 -0
tabmat/formula.py +60 -48
tabmat/split_matrix.py +1 -1
{tabmat-4.1.5.dist-info → tabmat-4.2.1.dist-info}/METADATA +3 -4
tabmat-4.2.1.dist-info/RECORD +26 -0
{tabmat-4.1.5.dist-info → tabmat-4.2.1.dist-info}/WHEEL +1 -1
tabmat-4.1.5.dist-info/RECORD +0 -26
{tabmat-4.1.5.dist-info → tabmat-4.2.1.dist-info}/licenses/LICENSE +0 -0
{tabmat-4.1.5.dist-info → tabmat-4.2.1.dist-info}/top_level.txt +0 -0

tabmat/benchmark/generate_matrices.py CHANGED Viewed

@@ -13,7 +13,7 @@ def make_dense_matrices(n_rows: int, n_cols: int) -> dict:
     dense_matrices = {"numpy_C": np.random.random((n_rows, n_cols))}
     dense_matrices["numpy_F"] = dense_matrices["numpy_C"].copy(order="F")
     assert dense_matrices["numpy_F"].flags["F_CONTIGUOUS"]
-    dense_matrices["tabmat"] = tm.DenseMatrix(dense_matrices["numpy_C"])
+    dense_matrices["tabmat"] = tm.DenseMatrix(dense_matrices["numpy_C"])  # type: ignore
     return dense_matrices

tabmat/categorical_matrix.py CHANGED Viewed

@@ -240,7 +240,13 @@ def _extract_codes_and_categories_pandas(cat_vec) -> tuple[np.ndarray, np.ndarra
 def _extract_codes_and_categories_polars(cat_vec) -> tuple[np.ndarray, np.ndarray]:
-    if not isinstance(cat_vec.dtype, (pl.Categorical, pl.Enum)):
+    dtype = cat_vec.dtype
+    if isinstance(dtype, pl.Enum):
+        categories = cat_vec.cat.get_categories().to_numpy()
+        indices = cat_vec.to_physical().fill_null(-1).to_numpy()
+        return indices, categories
+    if not isinstance(cat_vec.dtype, pl.Categorical):
         cat_vec = cat_vec.cast(pl.Categorical)
     # as of polars 1.32, `get_categories()` won't yield a useful result as
     # this is "not per column" anymore.
@@ -300,7 +306,7 @@ def _row_col_indexing(
     is_col_indexed = not (cols is None or len(cols) == arr.shape[1])
     if is_row_indexed and is_col_indexed:
-        return arr[np.ix_(rows, cols)]
+        return arr[np.ix_(rows, cols)]  # type: ignore
     elif is_row_indexed:
         return arr[rows]
     elif is_col_indexed:
@@ -691,7 +697,7 @@ class CategoricalMatrix(MatrixBase):
             )
         # TODO: data should be uint8
-        data = np.ones(self.shape[0], dtype=int)
+        data: np.ndarray = np.ones(self.shape[0], dtype=int)
         return sps.csr_matrix(
             (data, self.indices, np.arange(self.shape[0] + 1, dtype=int)),
             shape=self.shape,

tabmat/constructor.py CHANGED Viewed

@@ -396,7 +396,7 @@ def from_formula(
     )
     result = materializer.get_model_matrix(spec)
-    term_names = np.zeros(len(result.term_names), dtype="object")
+    term_names: np.ndarray = np.zeros(len(result.term_names), dtype="object")
     for term, indices in result.model_spec.term_indices.items():
         term_names[indices] = str(term)
     result.term_names = term_names.tolist()

tabmat/ext/categorical.cp310-win_amd64.pyd CHANGED Viewed

Binary file

tabmat/ext/dense.cp310-win_amd64.pyd CHANGED Viewed

Binary file

tabmat/ext/sparse.cp310-win_amd64.pyd CHANGED Viewed

Binary file

tabmat/ext/split.cp310-win_amd64.pyd CHANGED Viewed

Binary file

tabmat/formula.py CHANGED Viewed

@@ -5,19 +5,21 @@ from collections import OrderedDict
 from collections.abc import Iterable
 from typing import Any, Optional, Union
+import narwhals.stable.v2 as nw
 import numpy as np
 import numpy.typing
 import pandas as pd
 from formulaic import ModelMatrix, ModelSpec
 from formulaic.errors import FactorEncodingError
 from formulaic.materializers import FormulaMaterializer
-from formulaic.materializers.types import FactorValues, NAAction, ScopedTerm
+from formulaic.materializers.types import FactorValues, ScopedTerm
 from formulaic.parser.types import Term
 from formulaic.transforms import stateful_transform
+from formulaic.utils.null_handling import drop_rows as drop_nulls
 from interface_meta import override
 from scipy import sparse as sps
-from .categorical_matrix import CategoricalMatrix
+from .categorical_matrix import CategoricalMatrix, _extract_codes_and_categories
 from .constructor_util import _split_sparse_and_dense_parts
 from .dense_matrix import DenseMatrix
 from .matrix_base import MatrixBase
@@ -53,34 +55,24 @@ class TabmatMaterializer(FormulaMaterializer):
         self.cat_missing_method = self.params.get("cat_missing_method", "fail")
         self.cat_missing_name = self.params.get("cat_missing_name", "(MISSING)")
+        # Always convert input to narwhals DataFrame
+        self.__narwhals_data = nw.from_native(self.data, eager_only=True)
+        self.__data_context = self.__narwhals_data.to_dict()
         # We can override formulaic's C() function here
         self.context["C"] = _C
-    @override
-    def _is_categorical(self, values):
-        if isinstance(values, (pd.Series, pd.Categorical)):
-            return values.dtype == object or isinstance(
-                values.dtype, (pd.CategoricalDtype, pd.StringDtype)
-            )
-        return super()._is_categorical(values)
+    @override  # type: ignore
+    @property
+    def data_context(self):
+        return self.__data_context
     @override
-    def _check_for_nulls(self, name, values, na_action, drop_rows):
-        if na_action is NAAction.IGNORE:
-            return
-        if na_action is NAAction.RAISE:
-            if isinstance(values, pd.Series) and values.isnull().values.any():
-                raise ValueError(f"`{name}` contains null values after evaluation.")
-        elif na_action is NAAction.DROP:
-            if isinstance(values, pd.Series):
-                drop_rows.update(np.flatnonzero(values.isnull().values))
-        else:
-            raise ValueError(
-                f"Do not know how to interpret `na_action` = {repr(na_action)}."
-            )
+    def _is_categorical(self, values: Any) -> bool:
+        if nw.dependencies.is_narwhals_series(values):
+            if not values.dtype.is_numeric():
+                return True
+        return super()._is_categorical(values)
     @override
     def _encode_constant(self, value, metadata, encoder_state, spec, drop_rows):
@@ -90,9 +82,9 @@ class TabmatMaterializer(FormulaMaterializer):
     @override
     def _encode_numerical(self, values, metadata, encoder_state, spec, drop_rows):
         if drop_rows:
-            values = values.drop(index=values.index[drop_rows])
-        if isinstance(values, pd.Series):
-            values = values.to_numpy().astype(self.dtype, copy=False)
+            values = drop_nulls(values, indices=drop_rows)
+        if isinstance(values, nw.Series):
+            values = values.to_numpy().astype(self.dtype)
         if (values != 0).mean() <= self.sparse_threshold:
             return _InteractableSparseVector(sps.csc_matrix(values[:, np.newaxis]))
         else:
@@ -104,7 +96,7 @@ class TabmatMaterializer(FormulaMaterializer):
     ):
         # We do not do any encoding here as it is handled by tabmat
         if drop_rows:
-            values = values.drop(index=values.index[drop_rows])
+            values = drop_nulls(values, indices=drop_rows)
         return encode_contrasts(
             values,
             reduced_rank=reduced_rank,
@@ -428,17 +420,22 @@ class _InteractableCategoricalVector(_InteractableVector):
         self.name = name
     @classmethod
-    def from_categorical(
+    def from_codes(
         cls,
-        cat: pd.Categorical,
+        codes: np.ndarray,
+        categories: list,
         reduced_rank: bool,
         missing_method: str = "fail",
         missing_name: str = "(MISSING)",
         add_missing_category: bool = False,
     ) -> "_InteractableCategoricalVector":
-        """Create an interactable categorical vector from a pandas categorical."""
-        categories = cat.categories.tolist()
-        codes = cat.codes.copy().astype(np.int64)
+        """Create an interactable categorical vector from integer codes and categories.
+        The `codes` array is expected to contain integer category codes, using
+        -1 for missing values and -2 for rows that should be dropped.
+        """
+        codes = codes.copy().astype(np.int64)
+        categories = categories.copy()
         if reduced_rank:
             codes[codes == 0] = -2
@@ -458,7 +455,7 @@ class _InteractableCategoricalVector(_InteractableVector):
         return cls(
             codes=codes,
             categories=categories,
-            multipliers=np.ones(len(cat.codes)),
+            multipliers=np.ones(len(codes)),
         )
     def __rmul__(self, other):
@@ -674,7 +671,7 @@ def _C(
     data,
     *,
     levels: Optional[Iterable[str]] = None,
-    missing_method: str = "fail",
+    missing_method: Optional[str] = None,
     missing_name: str = "(MISSING)",
     spans_intercept: bool = True,
 ):
@@ -694,12 +691,13 @@ def _C(
         model_spec: ModelSpec,
     ):
         if drop_rows:
-            values = values.drop(index=values.index[drop_rows])
+            values = drop_nulls(values, indices=drop_rows)
         return encode_contrasts(
             values,
             levels=levels,
             reduced_rank=reduced_rank,
-            missing_method=missing_method,
+            missing_method=missing_method
+            or model_spec.materializer_params.get("cat_missing_method", "fail"),  # type: ignore
             missing_name=missing_name,
             _state=encoder_state,
             _spec=model_spec,
@@ -715,14 +713,14 @@ def _C(
 @stateful_transform
 def encode_contrasts(
-    data,
+    data: nw.Series,
     *,
     levels: Optional[Iterable[str]] = None,
     missing_method: str = "fail",
     missing_name: str = "(MISSING)",
     reduced_rank: bool = False,
-    _state=None,
-    _spec=None,
+    _state: dict[str, Any] = {},
+    _spec: Optional[ModelSpec] = None,
 ) -> FactorValues[_InteractableCategoricalVector]:
     """
     Encode a categorical dataset into one an _InteractableCategoricalVector
@@ -738,6 +736,13 @@ def encode_contrasts(
     levels = levels if levels is not None else _state.get("categories")
     add_missing_category = _state.get("add_missing_category", False)
+    if data.dtype.is_numeric():
+        # Polars enums only support string values
+        data = data.cast(nw.String)
+        # Convert levels to strings as well to match data type
+        if levels is not None:
+            levels = [str(level) for level in levels]
     # Check for unseen categories when levels are specified
     if levels is not None:
         if missing_method == "convert" and not add_missing_category:
@@ -746,21 +751,28 @@ def encode_contrasts(
             #  - missings are no problem in the other cases
             unseen_categories = set(data.unique()) - set(levels)
         else:
-            unseen_categories = set(data.dropna().unique()) - set(levels)
+            unseen_categories = set(data.drop_nulls().unique()) - set(levels)
         if unseen_categories:
             raise ValueError(
                 f"Column {data.name} contains unseen categories: {unseen_categories}."
             )
+    else:
+        # Not super efficient as we do it again in _extract_codes_and_categories
+        levels = list(data.drop_nulls().unique().sort())
+    cat = data.cast(nw.Enum(levels))
+    codes, categories = _extract_codes_and_categories(cat)
+    categories = list(categories)
-    cat = pd.Categorical(data._values, categories=levels)
-    _state["categories"] = cat.categories
-    _state["add_missing_category"] = add_missing_category or (
-        missing_method == "convert" and cat.isna().any()
+    _state["categories"] = categories
+    _state["add_missing_category"] = add_missing_category or bool(
+        missing_method == "convert" and cat.is_null().any()
     )
-    return _InteractableCategoricalVector.from_categorical(
-        cat,
+    return _InteractableCategoricalVector.from_codes(
+        codes=codes,
+        categories=categories,
         reduced_rank=reduced_rank,
         missing_method=missing_method,
         missing_name=missing_name,

tabmat/split_matrix.py CHANGED Viewed

@@ -527,7 +527,7 @@ class SplitMatrix(MatrixBase):
         list[Optional[str]]
             Column names.
         """
-        names = np.empty(self.shape[1], dtype=object)
+        names: np.ndarray = np.empty(self.shape[1], dtype=object)
         for idx, mat in zip(self.indices, self.matrices):
             names[idx] = mat.get_names(type, missing_prefix, idx)
         return names.tolist()

{tabmat-4.1.5.dist-info → tabmat-4.2.1.dist-info}/METADATA RENAMED Viewed

@@ -1,20 +1,19 @@
 Metadata-Version: 2.4
 Name: tabmat
-Version: 4.1.5
+Version: 4.2.1
 Summary: Efficient matrix representations for working with tabular data.
 Home-page: https://github.com/Quantco/tabmat
 Author: QuantCo, Inc.
 Author-email: noreply@quantco.com
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
-Requires-Python: >=3.9
+Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: formulaic>=0.6
+Requires-Dist: formulaic>=1.2
 Requires-Dist: narwhals
 Requires-Dist: numpy
 Requires-Dist: scipy

tabmat-4.2.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,26 @@
+tabmat/__init__.py,sha256=uvBIwdxTmQCtfs3kfX75sMYGg0wBRNe0rUyQ3cvhU7M,759
+tabmat/categorical_matrix.py,sha256=MaFcyO7LmIuXRpyIjuCgbEI6ScZBL-0fibOGslGqZvU,33170
+tabmat/constructor.py,sha256=YO_8SRP8sZlxx4ELgcpZ5sRYW30lex6kujejQVcozaY,16582
+tabmat/constructor_util.py,sha256=IRtDJ0EQKiZVPH2aUvOuftPpgQog-kmq6dItGx8TPf8,1753
+tabmat/dense_matrix.py,sha256=-iQseyKovWyXHvPWg1w2URYNJbTKAOEVy34fHGRtnBU,12375
+tabmat/formula.py,sha256=lor2w9iswrfwKzw2qINAS-kCVelVVXhkR2pbfWXs5ho,29292
+tabmat/matrix_base.py,sha256=NIbL5myAyVDoL5RQhSod8BKX9kbVGTunggN7uMBLjD4,8192
+tabmat/sparse_matrix.py,sha256=wVgz7w8QWPskSzJyMSKo8O0B5NkC8h027BXgpmbToCc,14377
+tabmat/split_matrix.py,sha256=jRjnX0JU_KA29_Cm2fZWmRxgq7vueGDf6vX2UuaY5qI,21593
+tabmat/standardized_mat.py,sha256=ogrHF2vG6Es0WSNgcMNukDAlYq7R4yzB4-HuoovUnCE,13514
+tabmat/util.py,sha256=xxX5S5-m4Jy8Nr56T7IuLibZZcgUc8-xlU84N5xQZNA,4175
+tabmat/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tabmat/benchmark/generate_matrices.py,sha256=1H-Y3SngnM7HgUT_YkOezPfnSdD9NwlvlJviKF4Aq1E,4923
+tabmat/benchmark/main.py,sha256=auwBIbBXM8nOtDkbLV95Da52Wk52Jw5ayJbX_Z6FK8Q,12420
+tabmat/benchmark/memory_tools.py,sha256=mFO-86Shxs1WmDf2--Xbxa3z3hTz4WmbKbFdrb3bxA8,1385
+tabmat/benchmark/visualize_benchmarks.py,sha256=-fr4rDOKtmCHsXTvuk_aF_m3egUR7z_ac3OiG4ebx-Q,5922
+tabmat/ext/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tabmat/ext/categorical.cp310-win_amd64.pyd,sha256=-qLSl_fI-wKm3aPyYaHOao-gYgqBPaRkRYugWGu5r70,310784
+tabmat/ext/dense.cp310-win_amd64.pyd,sha256=nfF1ZeHf4wmrNCULKrl2XqwPp77CnlxfmCFBj8iUFs4,262144
+tabmat/ext/sparse.cp310-win_amd64.pyd,sha256=bak26vu4dDLWJg7ERsPylWNfGlR5P27gEk50itsT4Sg,672256
+tabmat/ext/split.cp310-win_amd64.pyd,sha256=RGkDFGCM7TQ447jE53Ds9GBx-4chePwaM116acksjLo,245248
+tabmat-4.2.1.dist-info/licenses/LICENSE,sha256=5NI6VxMb_AfgPHL2nQttFTurJZWQ8ks5UWTpNEsD9K0,1472
+tabmat-4.2.1.dist-info/METADATA,sha256=2neapQHbY3HbdtA0GGICTdkWDwo4DV2oJ-h3fqfZVM0,6227
+tabmat-4.2.1.dist-info/WHEEL,sha256=lVtJYX4SZwMxwg8oP4kB_UdF4VQRXLlqu7hUy_2nnAE,102
+tabmat-4.2.1.dist-info/top_level.txt,sha256=dAA5sdRVvDsNOvJd8hxwmetHTmRhp_Y08al7_gHqs9c,7
+tabmat-4.2.1.dist-info/RECORD,,

{tabmat-4.1.5.dist-info → tabmat-4.2.1.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.9.0)
+Generator: setuptools (80.10.2)
 Root-Is-Purelib: false
 Tag: cp310-cp310-win_amd64

tabmat-4.1.5.dist-info/RECORD DELETED Viewed

@@ -1,26 +0,0 @@
-tabmat/__init__.py,sha256=uvBIwdxTmQCtfs3kfX75sMYGg0wBRNe0rUyQ3cvhU7M,759
-tabmat/categorical_matrix.py,sha256=Sl1Zkk_E3A5uKbUg4GXrQSPOAjbStPKAkKdsRuSiSEs,32924
-tabmat/constructor.py,sha256=MXquP1UfhZk6mZdmiigmMsvn8UMdKF05dU7HlggES9M,16570
-tabmat/constructor_util.py,sha256=IRtDJ0EQKiZVPH2aUvOuftPpgQog-kmq6dItGx8TPf8,1753
-tabmat/dense_matrix.py,sha256=-iQseyKovWyXHvPWg1w2URYNJbTKAOEVy34fHGRtnBU,12375
-tabmat/formula.py,sha256=lrciexxxcNl_imYMQY7GcNwmKq1GKCWgo-yPzmEk_Ps,28744
-tabmat/matrix_base.py,sha256=NIbL5myAyVDoL5RQhSod8BKX9kbVGTunggN7uMBLjD4,8192
-tabmat/sparse_matrix.py,sha256=wVgz7w8QWPskSzJyMSKo8O0B5NkC8h027BXgpmbToCc,14377
-tabmat/split_matrix.py,sha256=N-3XboRNN-pYsZuds74-ldxODDqexeHT5CL-9IxmjJ8,21581
-tabmat/standardized_mat.py,sha256=ogrHF2vG6Es0WSNgcMNukDAlYq7R4yzB4-HuoovUnCE,13514
-tabmat/util.py,sha256=xxX5S5-m4Jy8Nr56T7IuLibZZcgUc8-xlU84N5xQZNA,4175
-tabmat/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tabmat/benchmark/generate_matrices.py,sha256=MlMwENeE26D22VvAAqowjHBOlaRnvdXXePlUw54L1pI,4907
-tabmat/benchmark/main.py,sha256=auwBIbBXM8nOtDkbLV95Da52Wk52Jw5ayJbX_Z6FK8Q,12420
-tabmat/benchmark/memory_tools.py,sha256=mFO-86Shxs1WmDf2--Xbxa3z3hTz4WmbKbFdrb3bxA8,1385
-tabmat/benchmark/visualize_benchmarks.py,sha256=-fr4rDOKtmCHsXTvuk_aF_m3egUR7z_ac3OiG4ebx-Q,5922
-tabmat/ext/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tabmat/ext/categorical.cp310-win_amd64.pyd,sha256=G_RDdyQYtOPKg6HgPDMdx4MtymbgC6EiWKZYS3XWwI8,310784
-tabmat/ext/dense.cp310-win_amd64.pyd,sha256=_Sh2YfTvlIuuQLYMjJnaXPVlM4nx-uAgkS8i_wvmbcw,262656
-tabmat/ext/sparse.cp310-win_amd64.pyd,sha256=QE787ILvpsJJJnpmiZL7TIjcXoBM4TOMTsL8byAO-7Y,672256
-tabmat/ext/split.cp310-win_amd64.pyd,sha256=zBm1Whx0OQEmOz2kN9bBQO617i-koYTz8YTJAOG7mIQ,245248
-tabmat-4.1.5.dist-info/licenses/LICENSE,sha256=5NI6VxMb_AfgPHL2nQttFTurJZWQ8ks5UWTpNEsD9K0,1472
-tabmat-4.1.5.dist-info/METADATA,sha256=bfh1IMj4VTbXdgTvwc5gewg5b_8nI3IvmF3kbXmIVyU,6277
-tabmat-4.1.5.dist-info/WHEEL,sha256=KUuBC6lxAbHCKilKua8R9W_TM71_-9Sg5uEP3uDWcoU,101
-tabmat-4.1.5.dist-info/top_level.txt,sha256=dAA5sdRVvDsNOvJd8hxwmetHTmRhp_Y08al7_gHqs9c,7
-tabmat-4.1.5.dist-info/RECORD,,

{tabmat-4.1.5.dist-info → tabmat-4.2.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{tabmat-4.1.5.dist-info → tabmat-4.2.1.dist-info}/top_level.txt RENAMED Viewed

File without changes