tabmat 4.1.5__cp310-cp310-win_amd64.whl → 4.2.1__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,7 +13,7 @@ def make_dense_matrices(n_rows: int, n_cols: int) -> dict:
13
13
  dense_matrices = {"numpy_C": np.random.random((n_rows, n_cols))}
14
14
  dense_matrices["numpy_F"] = dense_matrices["numpy_C"].copy(order="F")
15
15
  assert dense_matrices["numpy_F"].flags["F_CONTIGUOUS"]
16
- dense_matrices["tabmat"] = tm.DenseMatrix(dense_matrices["numpy_C"])
16
+ dense_matrices["tabmat"] = tm.DenseMatrix(dense_matrices["numpy_C"]) # type: ignore
17
17
  return dense_matrices
18
18
 
19
19
 
@@ -240,7 +240,13 @@ def _extract_codes_and_categories_pandas(cat_vec) -> tuple[np.ndarray, np.ndarra
240
240
 
241
241
 
242
242
  def _extract_codes_and_categories_polars(cat_vec) -> tuple[np.ndarray, np.ndarray]:
243
- if not isinstance(cat_vec.dtype, (pl.Categorical, pl.Enum)):
243
+ dtype = cat_vec.dtype
244
+ if isinstance(dtype, pl.Enum):
245
+ categories = cat_vec.cat.get_categories().to_numpy()
246
+ indices = cat_vec.to_physical().fill_null(-1).to_numpy()
247
+ return indices, categories
248
+
249
+ if not isinstance(cat_vec.dtype, pl.Categorical):
244
250
  cat_vec = cat_vec.cast(pl.Categorical)
245
251
  # as of polars 1.32, `get_categories()` won't yield a useful result as
246
252
  # this is "not per column" anymore.
@@ -300,7 +306,7 @@ def _row_col_indexing(
300
306
  is_col_indexed = not (cols is None or len(cols) == arr.shape[1])
301
307
 
302
308
  if is_row_indexed and is_col_indexed:
303
- return arr[np.ix_(rows, cols)]
309
+ return arr[np.ix_(rows, cols)] # type: ignore
304
310
  elif is_row_indexed:
305
311
  return arr[rows]
306
312
  elif is_col_indexed:
@@ -691,7 +697,7 @@ class CategoricalMatrix(MatrixBase):
691
697
  )
692
698
 
693
699
  # TODO: data should be uint8
694
- data = np.ones(self.shape[0], dtype=int)
700
+ data: np.ndarray = np.ones(self.shape[0], dtype=int)
695
701
  return sps.csr_matrix(
696
702
  (data, self.indices, np.arange(self.shape[0] + 1, dtype=int)),
697
703
  shape=self.shape,
tabmat/constructor.py CHANGED
@@ -396,7 +396,7 @@ def from_formula(
396
396
  )
397
397
  result = materializer.get_model_matrix(spec)
398
398
 
399
- term_names = np.zeros(len(result.term_names), dtype="object")
399
+ term_names: np.ndarray = np.zeros(len(result.term_names), dtype="object")
400
400
  for term, indices in result.model_spec.term_indices.items():
401
401
  term_names[indices] = str(term)
402
402
  result.term_names = term_names.tolist()
Binary file
Binary file
Binary file
Binary file
tabmat/formula.py CHANGED
@@ -5,19 +5,21 @@ from collections import OrderedDict
5
5
  from collections.abc import Iterable
6
6
  from typing import Any, Optional, Union
7
7
 
8
+ import narwhals.stable.v2 as nw
8
9
  import numpy as np
9
10
  import numpy.typing
10
11
  import pandas as pd
11
12
  from formulaic import ModelMatrix, ModelSpec
12
13
  from formulaic.errors import FactorEncodingError
13
14
  from formulaic.materializers import FormulaMaterializer
14
- from formulaic.materializers.types import FactorValues, NAAction, ScopedTerm
15
+ from formulaic.materializers.types import FactorValues, ScopedTerm
15
16
  from formulaic.parser.types import Term
16
17
  from formulaic.transforms import stateful_transform
18
+ from formulaic.utils.null_handling import drop_rows as drop_nulls
17
19
  from interface_meta import override
18
20
  from scipy import sparse as sps
19
21
 
20
- from .categorical_matrix import CategoricalMatrix
22
+ from .categorical_matrix import CategoricalMatrix, _extract_codes_and_categories
21
23
  from .constructor_util import _split_sparse_and_dense_parts
22
24
  from .dense_matrix import DenseMatrix
23
25
  from .matrix_base import MatrixBase
@@ -53,34 +55,24 @@ class TabmatMaterializer(FormulaMaterializer):
53
55
  self.cat_missing_method = self.params.get("cat_missing_method", "fail")
54
56
  self.cat_missing_name = self.params.get("cat_missing_name", "(MISSING)")
55
57
 
58
+ # Always convert input to narwhals DataFrame
59
+ self.__narwhals_data = nw.from_native(self.data, eager_only=True)
60
+ self.__data_context = self.__narwhals_data.to_dict()
61
+
56
62
  # We can override formulaic's C() function here
57
63
  self.context["C"] = _C
58
64
 
59
- @override
60
- def _is_categorical(self, values):
61
- if isinstance(values, (pd.Series, pd.Categorical)):
62
- return values.dtype == object or isinstance(
63
- values.dtype, (pd.CategoricalDtype, pd.StringDtype)
64
- )
65
- return super()._is_categorical(values)
65
+ @override # type: ignore
66
+ @property
67
+ def data_context(self):
68
+ return self.__data_context
66
69
 
67
70
  @override
68
- def _check_for_nulls(self, name, values, na_action, drop_rows):
69
- if na_action is NAAction.IGNORE:
70
- return
71
-
72
- if na_action is NAAction.RAISE:
73
- if isinstance(values, pd.Series) and values.isnull().values.any():
74
- raise ValueError(f"`{name}` contains null values after evaluation.")
75
-
76
- elif na_action is NAAction.DROP:
77
- if isinstance(values, pd.Series):
78
- drop_rows.update(np.flatnonzero(values.isnull().values))
79
-
80
- else:
81
- raise ValueError(
82
- f"Do not know how to interpret `na_action` = {repr(na_action)}."
83
- )
71
+ def _is_categorical(self, values: Any) -> bool:
72
+ if nw.dependencies.is_narwhals_series(values):
73
+ if not values.dtype.is_numeric():
74
+ return True
75
+ return super()._is_categorical(values)
84
76
 
85
77
  @override
86
78
  def _encode_constant(self, value, metadata, encoder_state, spec, drop_rows):
@@ -90,9 +82,9 @@ class TabmatMaterializer(FormulaMaterializer):
90
82
  @override
91
83
  def _encode_numerical(self, values, metadata, encoder_state, spec, drop_rows):
92
84
  if drop_rows:
93
- values = values.drop(index=values.index[drop_rows])
94
- if isinstance(values, pd.Series):
95
- values = values.to_numpy().astype(self.dtype, copy=False)
85
+ values = drop_nulls(values, indices=drop_rows)
86
+ if isinstance(values, nw.Series):
87
+ values = values.to_numpy().astype(self.dtype)
96
88
  if (values != 0).mean() <= self.sparse_threshold:
97
89
  return _InteractableSparseVector(sps.csc_matrix(values[:, np.newaxis]))
98
90
  else:
@@ -104,7 +96,7 @@ class TabmatMaterializer(FormulaMaterializer):
104
96
  ):
105
97
  # We do not do any encoding here as it is handled by tabmat
106
98
  if drop_rows:
107
- values = values.drop(index=values.index[drop_rows])
99
+ values = drop_nulls(values, indices=drop_rows)
108
100
  return encode_contrasts(
109
101
  values,
110
102
  reduced_rank=reduced_rank,
@@ -428,17 +420,22 @@ class _InteractableCategoricalVector(_InteractableVector):
428
420
  self.name = name
429
421
 
430
422
  @classmethod
431
- def from_categorical(
423
+ def from_codes(
432
424
  cls,
433
- cat: pd.Categorical,
425
+ codes: np.ndarray,
426
+ categories: list,
434
427
  reduced_rank: bool,
435
428
  missing_method: str = "fail",
436
429
  missing_name: str = "(MISSING)",
437
430
  add_missing_category: bool = False,
438
431
  ) -> "_InteractableCategoricalVector":
439
- """Create an interactable categorical vector from a pandas categorical."""
440
- categories = cat.categories.tolist()
441
- codes = cat.codes.copy().astype(np.int64)
432
+ """Create an interactable categorical vector from integer codes and categories.
433
+
434
+ The `codes` array is expected to contain integer category codes, using
435
+ -1 for missing values and -2 for rows that should be dropped.
436
+ """
437
+ codes = codes.copy().astype(np.int64)
438
+ categories = categories.copy()
442
439
 
443
440
  if reduced_rank:
444
441
  codes[codes == 0] = -2
@@ -458,7 +455,7 @@ class _InteractableCategoricalVector(_InteractableVector):
458
455
  return cls(
459
456
  codes=codes,
460
457
  categories=categories,
461
- multipliers=np.ones(len(cat.codes)),
458
+ multipliers=np.ones(len(codes)),
462
459
  )
463
460
 
464
461
  def __rmul__(self, other):
@@ -674,7 +671,7 @@ def _C(
674
671
  data,
675
672
  *,
676
673
  levels: Optional[Iterable[str]] = None,
677
- missing_method: str = "fail",
674
+ missing_method: Optional[str] = None,
678
675
  missing_name: str = "(MISSING)",
679
676
  spans_intercept: bool = True,
680
677
  ):
@@ -694,12 +691,13 @@ def _C(
694
691
  model_spec: ModelSpec,
695
692
  ):
696
693
  if drop_rows:
697
- values = values.drop(index=values.index[drop_rows])
694
+ values = drop_nulls(values, indices=drop_rows)
698
695
  return encode_contrasts(
699
696
  values,
700
697
  levels=levels,
701
698
  reduced_rank=reduced_rank,
702
- missing_method=missing_method,
699
+ missing_method=missing_method
700
+ or model_spec.materializer_params.get("cat_missing_method", "fail"), # type: ignore
703
701
  missing_name=missing_name,
704
702
  _state=encoder_state,
705
703
  _spec=model_spec,
@@ -715,14 +713,14 @@ def _C(
715
713
 
716
714
  @stateful_transform
717
715
  def encode_contrasts(
718
- data,
716
+ data: nw.Series,
719
717
  *,
720
718
  levels: Optional[Iterable[str]] = None,
721
719
  missing_method: str = "fail",
722
720
  missing_name: str = "(MISSING)",
723
721
  reduced_rank: bool = False,
724
- _state=None,
725
- _spec=None,
722
+ _state: dict[str, Any] = {},
723
+ _spec: Optional[ModelSpec] = None,
726
724
  ) -> FactorValues[_InteractableCategoricalVector]:
727
725
  """
728
726
  Encode a categorical dataset into one an _InteractableCategoricalVector
@@ -738,6 +736,13 @@ def encode_contrasts(
738
736
  levels = levels if levels is not None else _state.get("categories")
739
737
  add_missing_category = _state.get("add_missing_category", False)
740
738
 
739
+ if data.dtype.is_numeric():
740
+ # Polars enums only support string values
741
+ data = data.cast(nw.String)
742
+ # Convert levels to strings as well to match data type
743
+ if levels is not None:
744
+ levels = [str(level) for level in levels]
745
+
741
746
  # Check for unseen categories when levels are specified
742
747
  if levels is not None:
743
748
  if missing_method == "convert" and not add_missing_category:
@@ -746,21 +751,28 @@ def encode_contrasts(
746
751
  # - missings are no problem in the other cases
747
752
  unseen_categories = set(data.unique()) - set(levels)
748
753
  else:
749
- unseen_categories = set(data.dropna().unique()) - set(levels)
754
+ unseen_categories = set(data.drop_nulls().unique()) - set(levels)
750
755
 
751
756
  if unseen_categories:
752
757
  raise ValueError(
753
758
  f"Column {data.name} contains unseen categories: {unseen_categories}."
754
759
  )
760
+ else:
761
+ # Not super efficient as we do it again in _extract_codes_and_categories
762
+ levels = list(data.drop_nulls().unique().sort())
763
+
764
+ cat = data.cast(nw.Enum(levels))
765
+ codes, categories = _extract_codes_and_categories(cat)
766
+ categories = list(categories)
755
767
 
756
- cat = pd.Categorical(data._values, categories=levels)
757
- _state["categories"] = cat.categories
758
- _state["add_missing_category"] = add_missing_category or (
759
- missing_method == "convert" and cat.isna().any()
768
+ _state["categories"] = categories
769
+ _state["add_missing_category"] = add_missing_category or bool(
770
+ missing_method == "convert" and cat.is_null().any()
760
771
  )
761
772
 
762
- return _InteractableCategoricalVector.from_categorical(
763
- cat,
773
+ return _InteractableCategoricalVector.from_codes(
774
+ codes=codes,
775
+ categories=categories,
764
776
  reduced_rank=reduced_rank,
765
777
  missing_method=missing_method,
766
778
  missing_name=missing_name,
tabmat/split_matrix.py CHANGED
@@ -527,7 +527,7 @@ class SplitMatrix(MatrixBase):
527
527
  list[Optional[str]]
528
528
  Column names.
529
529
  """
530
- names = np.empty(self.shape[1], dtype=object)
530
+ names: np.ndarray = np.empty(self.shape[1], dtype=object)
531
531
  for idx, mat in zip(self.indices, self.matrices):
532
532
  names[idx] = mat.get_names(type, missing_prefix, idx)
533
533
  return names.tolist()
@@ -1,20 +1,19 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: tabmat
3
- Version: 4.1.5
3
+ Version: 4.2.1
4
4
  Summary: Efficient matrix representations for working with tabular data.
5
5
  Home-page: https://github.com/Quantco/tabmat
6
6
  Author: QuantCo, Inc.
7
7
  Author-email: noreply@quantco.com
8
8
  Classifier: Programming Language :: Python :: 3
9
- Classifier: Programming Language :: Python :: 3.9
10
9
  Classifier: Programming Language :: Python :: 3.10
11
10
  Classifier: Programming Language :: Python :: 3.11
12
11
  Classifier: Programming Language :: Python :: 3.12
13
12
  Classifier: Programming Language :: Python :: 3.13
14
- Requires-Python: >=3.9
13
+ Requires-Python: >=3.10
15
14
  Description-Content-Type: text/markdown
16
15
  License-File: LICENSE
17
- Requires-Dist: formulaic>=0.6
16
+ Requires-Dist: formulaic>=1.2
18
17
  Requires-Dist: narwhals
19
18
  Requires-Dist: numpy
20
19
  Requires-Dist: scipy
@@ -0,0 +1,26 @@
1
+ tabmat/__init__.py,sha256=uvBIwdxTmQCtfs3kfX75sMYGg0wBRNe0rUyQ3cvhU7M,759
2
+ tabmat/categorical_matrix.py,sha256=MaFcyO7LmIuXRpyIjuCgbEI6ScZBL-0fibOGslGqZvU,33170
3
+ tabmat/constructor.py,sha256=YO_8SRP8sZlxx4ELgcpZ5sRYW30lex6kujejQVcozaY,16582
4
+ tabmat/constructor_util.py,sha256=IRtDJ0EQKiZVPH2aUvOuftPpgQog-kmq6dItGx8TPf8,1753
5
+ tabmat/dense_matrix.py,sha256=-iQseyKovWyXHvPWg1w2URYNJbTKAOEVy34fHGRtnBU,12375
6
+ tabmat/formula.py,sha256=lor2w9iswrfwKzw2qINAS-kCVelVVXhkR2pbfWXs5ho,29292
7
+ tabmat/matrix_base.py,sha256=NIbL5myAyVDoL5RQhSod8BKX9kbVGTunggN7uMBLjD4,8192
8
+ tabmat/sparse_matrix.py,sha256=wVgz7w8QWPskSzJyMSKo8O0B5NkC8h027BXgpmbToCc,14377
9
+ tabmat/split_matrix.py,sha256=jRjnX0JU_KA29_Cm2fZWmRxgq7vueGDf6vX2UuaY5qI,21593
10
+ tabmat/standardized_mat.py,sha256=ogrHF2vG6Es0WSNgcMNukDAlYq7R4yzB4-HuoovUnCE,13514
11
+ tabmat/util.py,sha256=xxX5S5-m4Jy8Nr56T7IuLibZZcgUc8-xlU84N5xQZNA,4175
12
+ tabmat/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
+ tabmat/benchmark/generate_matrices.py,sha256=1H-Y3SngnM7HgUT_YkOezPfnSdD9NwlvlJviKF4Aq1E,4923
14
+ tabmat/benchmark/main.py,sha256=auwBIbBXM8nOtDkbLV95Da52Wk52Jw5ayJbX_Z6FK8Q,12420
15
+ tabmat/benchmark/memory_tools.py,sha256=mFO-86Shxs1WmDf2--Xbxa3z3hTz4WmbKbFdrb3bxA8,1385
16
+ tabmat/benchmark/visualize_benchmarks.py,sha256=-fr4rDOKtmCHsXTvuk_aF_m3egUR7z_ac3OiG4ebx-Q,5922
17
+ tabmat/ext/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
+ tabmat/ext/categorical.cp310-win_amd64.pyd,sha256=-qLSl_fI-wKm3aPyYaHOao-gYgqBPaRkRYugWGu5r70,310784
19
+ tabmat/ext/dense.cp310-win_amd64.pyd,sha256=nfF1ZeHf4wmrNCULKrl2XqwPp77CnlxfmCFBj8iUFs4,262144
20
+ tabmat/ext/sparse.cp310-win_amd64.pyd,sha256=bak26vu4dDLWJg7ERsPylWNfGlR5P27gEk50itsT4Sg,672256
21
+ tabmat/ext/split.cp310-win_amd64.pyd,sha256=RGkDFGCM7TQ447jE53Ds9GBx-4chePwaM116acksjLo,245248
22
+ tabmat-4.2.1.dist-info/licenses/LICENSE,sha256=5NI6VxMb_AfgPHL2nQttFTurJZWQ8ks5UWTpNEsD9K0,1472
23
+ tabmat-4.2.1.dist-info/METADATA,sha256=2neapQHbY3HbdtA0GGICTdkWDwo4DV2oJ-h3fqfZVM0,6227
24
+ tabmat-4.2.1.dist-info/WHEEL,sha256=lVtJYX4SZwMxwg8oP4kB_UdF4VQRXLlqu7hUy_2nnAE,102
25
+ tabmat-4.2.1.dist-info/top_level.txt,sha256=dAA5sdRVvDsNOvJd8hxwmetHTmRhp_Y08al7_gHqs9c,7
26
+ tabmat-4.2.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: false
4
4
  Tag: cp310-cp310-win_amd64
5
5
 
@@ -1,26 +0,0 @@
1
- tabmat/__init__.py,sha256=uvBIwdxTmQCtfs3kfX75sMYGg0wBRNe0rUyQ3cvhU7M,759
2
- tabmat/categorical_matrix.py,sha256=Sl1Zkk_E3A5uKbUg4GXrQSPOAjbStPKAkKdsRuSiSEs,32924
3
- tabmat/constructor.py,sha256=MXquP1UfhZk6mZdmiigmMsvn8UMdKF05dU7HlggES9M,16570
4
- tabmat/constructor_util.py,sha256=IRtDJ0EQKiZVPH2aUvOuftPpgQog-kmq6dItGx8TPf8,1753
5
- tabmat/dense_matrix.py,sha256=-iQseyKovWyXHvPWg1w2URYNJbTKAOEVy34fHGRtnBU,12375
6
- tabmat/formula.py,sha256=lrciexxxcNl_imYMQY7GcNwmKq1GKCWgo-yPzmEk_Ps,28744
7
- tabmat/matrix_base.py,sha256=NIbL5myAyVDoL5RQhSod8BKX9kbVGTunggN7uMBLjD4,8192
8
- tabmat/sparse_matrix.py,sha256=wVgz7w8QWPskSzJyMSKo8O0B5NkC8h027BXgpmbToCc,14377
9
- tabmat/split_matrix.py,sha256=N-3XboRNN-pYsZuds74-ldxODDqexeHT5CL-9IxmjJ8,21581
10
- tabmat/standardized_mat.py,sha256=ogrHF2vG6Es0WSNgcMNukDAlYq7R4yzB4-HuoovUnCE,13514
11
- tabmat/util.py,sha256=xxX5S5-m4Jy8Nr56T7IuLibZZcgUc8-xlU84N5xQZNA,4175
12
- tabmat/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
- tabmat/benchmark/generate_matrices.py,sha256=MlMwENeE26D22VvAAqowjHBOlaRnvdXXePlUw54L1pI,4907
14
- tabmat/benchmark/main.py,sha256=auwBIbBXM8nOtDkbLV95Da52Wk52Jw5ayJbX_Z6FK8Q,12420
15
- tabmat/benchmark/memory_tools.py,sha256=mFO-86Shxs1WmDf2--Xbxa3z3hTz4WmbKbFdrb3bxA8,1385
16
- tabmat/benchmark/visualize_benchmarks.py,sha256=-fr4rDOKtmCHsXTvuk_aF_m3egUR7z_ac3OiG4ebx-Q,5922
17
- tabmat/ext/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
- tabmat/ext/categorical.cp310-win_amd64.pyd,sha256=G_RDdyQYtOPKg6HgPDMdx4MtymbgC6EiWKZYS3XWwI8,310784
19
- tabmat/ext/dense.cp310-win_amd64.pyd,sha256=_Sh2YfTvlIuuQLYMjJnaXPVlM4nx-uAgkS8i_wvmbcw,262656
20
- tabmat/ext/sparse.cp310-win_amd64.pyd,sha256=QE787ILvpsJJJnpmiZL7TIjcXoBM4TOMTsL8byAO-7Y,672256
21
- tabmat/ext/split.cp310-win_amd64.pyd,sha256=zBm1Whx0OQEmOz2kN9bBQO617i-koYTz8YTJAOG7mIQ,245248
22
- tabmat-4.1.5.dist-info/licenses/LICENSE,sha256=5NI6VxMb_AfgPHL2nQttFTurJZWQ8ks5UWTpNEsD9K0,1472
23
- tabmat-4.1.5.dist-info/METADATA,sha256=bfh1IMj4VTbXdgTvwc5gewg5b_8nI3IvmF3kbXmIVyU,6277
24
- tabmat-4.1.5.dist-info/WHEEL,sha256=KUuBC6lxAbHCKilKua8R9W_TM71_-9Sg5uEP3uDWcoU,101
25
- tabmat-4.1.5.dist-info/top_level.txt,sha256=dAA5sdRVvDsNOvJd8hxwmetHTmRhp_Y08al7_gHqs9c,7
26
- tabmat-4.1.5.dist-info/RECORD,,