spforge 0.8.33__py3-none-any.whl → 0.8.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -6,5 +6,6 @@ from ._performances_transformers import (
     DiminishingValueTransformer as DiminishingValueTransformer,
     MinMaxTransformer as MinMaxTransformer,
     PartialStandardScaler as PartialStandardScaler,
+    QuantilePerformanceScaler as QuantilePerformanceScaler,
     SymmetricDistributionTransformer as SymmetricDistributionTransformer,
 )
@@ -4,6 +4,7 @@ from dataclasses import dataclass
 from typing import Literal
 
 import narwhals.stable.v2 as nw
+import numpy as np
 from narwhals.typing import IntoFrameT
 from sklearn.base import BaseEstimator, TransformerMixin
 
@@ -11,6 +12,7 @@ from spforge.performance_transformers._performances_transformers import (
     MinMaxTransformer,
     NarwhalsFeatureTransformer,
     PartialStandardScaler,
+    QuantilePerformanceScaler,
     SymmetricDistributionTransformer,
 )
 
@@ -86,9 +88,12 @@ class PerformanceManager(BaseEstimator, TransformerMixin):
         prefix: str = "performance__",
         min_value: float = -0.02,
         max_value: float = 1.02,
+        zero_inflation_threshold: float = 0.15,
     ):
         self.features = features
         self.prefix = prefix
+        # Store whether user explicitly disabled transformers (passed empty list)
+        self._user_disabled_transformers = transformer_names is not None and len(transformer_names) == 0
         self.transformer_names = transformer_names or [
             "symmetric",
             "partial_standard_scaler",
@@ -100,6 +105,7 @@ class PerformanceManager(BaseEstimator, TransformerMixin):
         self.performance_column = self.prefix + performance_column
         self.min_value = min_value
         self.max_value = max_value
+        self.zero_inflation_threshold = zero_inflation_threshold
 
         self.transformers = create_performance_scalers_transformers(
             transformer_names=self.transformer_names,
@@ -107,9 +113,47 @@ class PerformanceManager(BaseEstimator, TransformerMixin):
             features=self.features,
             prefix=self.prefix,
         )
+        self._using_quantile_scaler = False
 
     @nw.narwhalify
     def fit(self, df: IntoFrameT, y=None):
+        # Check for zero-inflated distributions and swap to quantile scaler if needed
+        # Only apply when user hasn't explicitly disabled transformers (passed empty list)
+        if self.zero_inflation_threshold > 0 and not self._user_disabled_transformers:
+            df = self._ensure_inputs_exist(df, self.transformers[0])
+            prefixed_features = [self.prefix + f for f in self.features]
+
+            for feature in prefixed_features:
+                if feature in df.columns:
+                    values = df[feature].to_numpy()
+                    values = values[np.isfinite(values)]
+
+                    # Skip if binary/categorical data (few unique values)
+                    # Quantile scaler is for continuous zero-inflated data, not binary outcomes
+                    n_unique = len(np.unique(values))
+                    if n_unique <= 3:
+                        continue
+
+                    zero_proportion = np.mean(np.abs(values) < 1e-10)
+
+                    if zero_proportion > self.zero_inflation_threshold:
+                        logging.info(
+                            f"Detected zero-inflated distribution for {feature} "
+                            f"({zero_proportion:.1%} zeros). Using QuantilePerformanceScaler."
+                        )
+                        self._using_quantile_scaler = True
+                        # Use original_transformers (deepcopy made before standard transformers
+                        # were appended to custom_transformers)
+                        self.transformers = [
+                            copy.deepcopy(t) for t in self.original_transformers
+                        ] + [
+                            QuantilePerformanceScaler(
+                                features=prefixed_features,
+                                prefix="",
+                            )
+                        ]
+                        break
+
         for t in self.transformers:
            df = self._ensure_inputs_exist(df, t)
            t.fit(df)
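
For orientation, here is a minimal standalone sketch of the zero-inflation check the hunk above performs during fit, assuming a plain NumPy array instead of the narwhals frame used in the package:

    import numpy as np

    def looks_zero_inflated(values: np.ndarray, threshold: float = 0.15) -> bool:
        # Mirrors the check in the diff: drop non-finite values, skip
        # low-cardinality (binary/categorical) columns, then compare the
        # share of near-zero values against the threshold.
        values = values[np.isfinite(values)]
        if len(np.unique(values)) <= 3:
            return False
        zero_proportion = np.mean(np.abs(values) < 1e-10)
        return zero_proportion > threshold

When this condition is met for any prefixed feature, the manager swaps its scaler chain for a QuantilePerformanceScaler, as the hunk above shows.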
@@ -169,6 +213,7 @@ class PerformanceWeightsManager(PerformanceManager):
         min_value: float = -0.02,
         prefix: str = "performance__",
         return_all_features: bool = False,
+        zero_inflation_threshold: float = 0.15,
     ):
         self.weights = weights
         self.return_all_features = return_all_features
@@ -181,6 +226,7 @@ class PerformanceWeightsManager(PerformanceManager):
             max_value=max_value,
             min_value=min_value,
             performance_column=performance_column,
+            zero_inflation_threshold=zero_inflation_threshold,
         )
 
     @nw.narwhalify
@@ -3,6 +3,7 @@ from typing import Literal, Protocol
 
 import narwhals
 import narwhals.stable.v2 as nw
+import numpy as np
 from lightgbm import LGBMRegressor
 from narwhals.typing import IntoFrameT
 from sklearn.base import BaseEstimator, TransformerMixin
@@ -420,3 +421,82 @@ class GroupByTransformer(BaseEstimator, TransformerMixin):
     @nw.narwhalify
     def transform(self, df: IntoFrameT) -> IntoFrameT:
         return df.join(self._grouped, on=self.granularity, how="left").to_native()
+
+
+class QuantilePerformanceScaler(BaseEstimator, TransformerMixin):
+    """
+    Quantile-based scaling for zero-inflated distributions.
+
+    Uses probability integral transform:
+    - Zeros → π/2 (midpoint of zero probability mass)
+    - Non-zeros → uniform on (π, 1) via empirical CDF
+
+    Fast: O(n log n) for fit, O(n) for transform.
+    """
+
+    def __init__(
+        self,
+        features: list[str],
+        zero_threshold: float = 1e-10,
+        n_quantiles: int = 1000,
+        prefix: str = "",
+    ):
+        self.features = features
+        self.zero_threshold = zero_threshold
+        self.n_quantiles = n_quantiles
+        self.prefix = prefix
+        self.features_out = [self.prefix + f for f in self.features]
+
+        self._zero_proportion: dict[str, float] = {}
+        self._nonzero_quantiles: dict[str, np.ndarray | None] = {}
+
+    @nw.narwhalify
+    def fit(self, df: IntoFrameT, y=None):
+        for feature in self.features:
+            values = df[feature].to_numpy()
+            values = values[np.isfinite(values)]
+
+            is_zero = np.abs(values) < self.zero_threshold
+            self._zero_proportion[feature] = np.mean(is_zero)
+
+            nonzero_values = values[~is_zero]
+            if len(nonzero_values) > 0:
+                percentiles = np.linspace(0, 100, self.n_quantiles + 1)
+                self._nonzero_quantiles[feature] = np.percentile(nonzero_values, percentiles)
+            else:
+                self._nonzero_quantiles[feature] = None
+        return self
+
+    @nw.narwhalify
+    def transform(self, df: IntoFrameT) -> IntoFrameT:
+        for feature in self.features:
+            out_feature = self.prefix + feature
+            values = df[feature].to_numpy()
+            result = np.full_like(values, np.nan, dtype=float)
+
+            # Handle NaN explicitly - preserve NaN in output
+            is_finite = np.isfinite(values)
+            is_zero = is_finite & (np.abs(values) < self.zero_threshold)
+            is_nonzero = is_finite & ~is_zero
+
+            pi = self._zero_proportion[feature]
+
+            # Zeros → midpoint of zero mass
+            result[is_zero] = pi / 2
+
+            # Non-zeros → interpolate to (π, 1)
+            nonzero_quantiles = self._nonzero_quantiles[feature]
+            if nonzero_quantiles is not None and np.any(is_nonzero):
+                nonzero_values = np.clip(
+                    values[is_nonzero], nonzero_quantiles[0], nonzero_quantiles[-1]
+                )
+                ranks = np.interp(
+                    nonzero_values,
+                    nonzero_quantiles,
+                    np.linspace(0, 1, len(nonzero_quantiles)),
+                )
+                result[is_nonzero] = pi + (1 - pi) * ranks
+
+            df = df.with_columns(**{out_feature: result})
+
+        return df.to_native()
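
The class above applies a probability integral transform to zero-inflated data. A rough standalone sketch of the same mapping on a single NumPy array follows; `pit_scale` is a hypothetical helper (not part of the package API) that uses empirical-CDF ranks instead of the percentile grid and np.interp the class uses:

    import numpy as np

    def pit_scale(values: np.ndarray, zero_threshold: float = 1e-10) -> np.ndarray:
        # Zeros map to pi/2 (midpoint of the zero mass); non-zeros map to
        # pi + (1 - pi) * ECDF(value), i.e. roughly uniform on (pi, 1).
        is_zero = np.abs(values) < zero_threshold
        pi = is_zero.mean()
        out = np.empty_like(values, dtype=float)
        out[is_zero] = pi / 2
        nonzero = values[~is_zero]
        if nonzero.size:
            ranks = nonzero.argsort().argsort() / max(nonzero.size - 1, 1)
            out[~is_zero] = pi + (1 - pi) * ranks
        return out

For example, pit_scale(np.array([0.0, 0.0, 1.0, 3.0])) gives roughly [0.25, 0.25, 0.5, 1.0]: half the mass is zero, so zeros sit at 0.25 and the non-zeros spread over (0.5, 1].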
@@ -29,6 +29,7 @@ from spforge.ratings._base import RatingGenerator, RatingKnownFeatures, RatingUn
 from spforge.ratings.start_rating_generator import StartRatingGenerator
 from spforge.ratings.utils import (
     add_opp_team_rating,
+    add_player_opponent_mean_projected,
     add_rating_difference_projected,
     add_rating_mean_projected,
     add_team_rating,
@@ -141,6 +142,9 @@ class PlayerRatingGenerator(RatingGenerator):
         self.PLAYER_DIFF_FROM_TEAM_PROJ_COL = self._suffix(
             str(RatingKnownFeatures.PLAYER_RATING_DIFFERENCE_FROM_TEAM_PROJECTED)
         )
+        self.PLAYER_OPP_MEAN_PROJ_COL = self._suffix(
+            str(RatingKnownFeatures.PLAYER_OPPONENT_MEAN_PROJECTED)
+        )
 
         self.TEAM_OFF_RATING_PROJ_COL = self._suffix(
             str(RatingKnownFeatures.TEAM_OFF_RATING_PROJECTED)
@@ -844,6 +848,7 @@ class PlayerRatingGenerator(RatingGenerator):
             or self.OPP_RATING_PROJ_COL in cols_to_add
             or self.DIFF_PROJ_COL in cols_to_add
             or self.PLAYER_DIFF_PROJ_COL in cols_to_add
+            or self.PLAYER_OPP_MEAN_PROJ_COL in cols_to_add
         ):
             df = add_team_rating_projected(
                 df=df,
@@ -865,6 +870,7 @@ class PlayerRatingGenerator(RatingGenerator):
             or self.OPP_RATING_PROJ_COL in cols_to_add
             or self.DIFF_PROJ_COL in cols_to_add
             or self.PLAYER_DIFF_PROJ_COL in cols_to_add
+            or self.PLAYER_OPP_MEAN_PROJ_COL in cols_to_add
         ):
             df = add_opp_team_rating(
                 df=df,
@@ -925,6 +931,15 @@ class PlayerRatingGenerator(RatingGenerator):
             rating_mean_out=self.MEAN_PROJ_COL,
         )
 
+        if self.PLAYER_OPP_MEAN_PROJ_COL in cols_to_add:
+            df = add_player_opponent_mean_projected(
+                df=df,
+                column_names=cn,
+                player_rating_col=self.PLAYER_RATING_COL,
+                opp_team_rating_col=self.OPP_RATING_PROJ_COL,
+                out_col=self.PLAYER_OPP_MEAN_PROJ_COL,
+            )
+
         if self.DIFF_COL in cols_to_add and self.DIFF_COL not in df.columns:
             if self.TEAM_RATING_COL not in df.columns:
                 df = add_team_rating(
spforge/ratings/enums.py CHANGED
@@ -17,6 +17,7 @@ class RatingKnownFeatures(StrEnum):
     PLAYER_RATING_DIFFERENCE_PROJECTED = "player_rating_difference_projected"
     TEAM_RATING_DIFFERENCE_PROJECTED = "team_rating_difference_projected"
     RATING_MEAN_PROJECTED = "rating_mean_projected"
+    PLAYER_OPPONENT_MEAN_PROJECTED = "player_opponent_mean_projected"
     TEAM_LEAGUE = "team_league"
     PLAYER_LEAGUE = "player_league"
     OPPONENT_LEAGUE = "opponent_league"
spforge/ratings/utils.py CHANGED
@@ -137,3 +137,16 @@ def add_rating_mean_projected(
     )
 
     return df.with_columns(pl.col(player_rating_col).mean().over(mid).alias(rating_mean_out))
+
+
+def add_player_opponent_mean_projected(
+    df: pl.DataFrame,
+    column_names: ColumnNames,
+    player_rating_col: str,
+    opp_team_rating_col: str,
+    out_col: str,
+) -> pl.DataFrame:
+    """Mean of player rating and opponent team rating."""
+    return df.with_columns(
+        ((pl.col(player_rating_col) + pl.col(opp_team_rating_col)) / 2).alias(out_col)
+    )
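
As a quick sanity check of the helper above, it reduces to a simple column mean; a minimal polars sketch with hypothetical ratings and column names (not taken from the package's tests):

    import polars as pl

    df = pl.DataFrame({"player_rating": [1050.0], "opp_rating_projected": [950.0]})
    # (1050 + 950) / 2 = 1000
    out = df.with_columns(
        ((pl.col("player_rating") + pl.col("opp_rating_projected")) / 2)
        .alias("player_opponent_mean_projected")
    )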
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: spforge
-Version: 0.8.33
+Version: 0.8.35
 Summary: A flexible framework for generating features, ratings, and building machine learning or other models for training and inference on sports data.
 Author-email: Mathias Holmstrøm <mathiasholmstom@gmail.com>
 License: See LICENSE file
@@ -46,21 +46,21 @@ spforge/feature_generator/_utils.py,sha256=KDn33ia1OYJTK8THFpvc_uRiH_Bl3fImGqqbf
 spforge/hyperparameter_tuning/__init__.py,sha256=Vcl8rVlJ7M708iPgqe4XxpZWgJKGux0Y5HgMCymRsHg,1099
 spforge/hyperparameter_tuning/_default_search_spaces.py,sha256=SjwXLpvYIu_JY8uPRHeL5Kgp1aa0slWDz8qsKDaohWQ,8020
 spforge/hyperparameter_tuning/_tuner.py,sha256=M79q3saM6r0UZJsRUUgfdDr-3Qii-F2-wuSAZLFtZDo,19246
-spforge/performance_transformers/__init__.py,sha256=U6d7_kltbUMLYCGBk4QAFVPJTxXD3etD9qUftV-O3q4,422
-spforge/performance_transformers/_performance_manager.py,sha256=WmjmlMEnq7y75MiI_s9Y-9eMXIyhPTUKrwsXRtgYp0k,9620
-spforge/performance_transformers/_performances_transformers.py,sha256=0lxuWjAfWBRXRgQsNJHjw3P-nlTtHBu4_bOVdoy7hq4,15536
+spforge/performance_transformers/__init__.py,sha256=J-5olqi1M_BUj3sN1NqAz9s28XAbuKK9M9xHq7IGlQU,482
+spforge/performance_transformers/_performance_manager.py,sha256=tR_4laGoC_KFRaw3Gy0TMI-r5gnicDmvmxPEgAvl4E0,12031
+spforge/performance_transformers/_performances_transformers.py,sha256=ZjkFDXoEe5fURpN-dNkrgFXpHEg4aFCWdBDnPyLtgkM,18368
 spforge/ratings/__init__.py,sha256=OZVH2Lo6END3n1X8qi4QcyAPlThIwAYwVKCiIuOQSQU,576
 spforge/ratings/_base.py,sha256=ne4BRrYFPqMirdFPVnyDN44wjFQwOQgWoUXu_59xgWE,14687
-spforge/ratings/_player_rating.py,sha256=Ii1HNz0tC25dxZy9B2b_ULZVNwExNvdyIGw_1gRIeko,67163
+spforge/ratings/_player_rating.py,sha256=KkTmKtacx-1cMuncWVIkoO-3srfEOAjx8o5prEeaAWQ,67811
 spforge/ratings/_team_rating.py,sha256=3m90-R2zW0k5EHwjw-83Hacz91fGmxW1LQ8ZUGHlgt4,24970
-spforge/ratings/enums.py,sha256=s7z_RcZS6Nlgfa_6tasO8_IABZJwywexe7sep9DJBgo,1739
+spforge/ratings/enums.py,sha256=maG0X4WMQeMVAc2wbceq1an-U-z8moZGeG2BAgfICDA,1809
 spforge/ratings/league_identifier.py,sha256=_KDUKOwoNU6RNFKE5jju4eYFGVNGBdJsv5mhNvMakfc,6019
 spforge/ratings/league_start_rating_optimizer.py,sha256=Q4Vo3QT-r55qP4aD9WftsTB00UOSRvxM1khlyuAGWNM,8582
 spforge/ratings/player_performance_predictor.py,sha256=GtPpYlALgbQk8YHeaiRbpRvJHxeAhKpRxsaVUc9zR5o,7963
 spforge/ratings/start_rating_generator.py,sha256=eSasa5Oe9n4IoTGjFCYyFQAGrJtzrBW-Qor97lmaYuM,6776
 spforge/ratings/team_performance_predictor.py,sha256=ThQOmYQUqKBB46ONYHOMM2arXFH8AkyKpAZzs80SjHA,7217
 spforge/ratings/team_start_rating_generator.py,sha256=vK-_m8KwcHopchch_lKNHSGLiiNm5q9Lenm0d1cP_po,5110
-spforge/ratings/utils.py,sha256=_zFemqz2jJkH8rn2EZpDt8N6FELUmYp9qCnPzRtOIGU,4497
+spforge/ratings/utils.py,sha256=WFxpiutHG9itJtjtagb26mjpHRjIhT7hopsiyetUgTg,4866
 spforge/scorer/__init__.py,sha256=wj8PCvYIl6742Xwmt86c3oy6iqE8Ss-OpwHud6kd9IY,256
 spforge/scorer/_score.py,sha256=DOl3wlHH0IlQelQA5CaNAfVtJhc544ZO5l-1mEno7nA,65276
 spforge/transformers/__init__.py,sha256=IPCsMcsgBqG52d0ttATLCY4HvFCQZddExlLt74U-zuI,390
@@ -71,7 +71,7 @@ spforge/transformers/_other_transformer.py,sha256=w2a7Wnki3vJe4GAkSa4kealw0GILIo
 spforge/transformers/_predictor.py,sha256=2sE6gfVrilXzPVcBurSrtqHw33v2ljygQcEYXt9LhZc,3119
 spforge/transformers/_simple_transformer.py,sha256=zGUFNQYMeoDSa2CoQejQNiNmKCBN5amWTvyOchiUHj0,5660
 spforge/transformers/_team_ratio_predictor.py,sha256=g8_bR53Yyv0iNCtol1O9bgJSeZcIco_AfbQuUxQJkeY,6884
-spforge-0.8.33.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+spforge-0.8.35.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
 tests/test_autopipeline.py,sha256=7cNAn-nmGolfyfk3THh9IKcHZfRA-pLYC_xAyMg-No4,26863
 tests/test_autopipeline_context.py,sha256=IuRUY4IA6uMObvbl2pXSaXO2_tl3qX6wEbTZY0dkTMI,1240
 tests/test_feature_generator_pipeline.py,sha256=CK0zVL8PfTncy3RmG9i-YpgwjOIV7yJhV7Q44tbetI8,19020
@@ -92,9 +92,9 @@ tests/feature_generator/test_rolling_mean_days.py,sha256=EyOvdJDnmgPfe13uQBOkwo7
 tests/feature_generator/test_rolling_window.py,sha256=_o9oljcAIZ14iI7e8WFeAsfXxILnyqBffit21HOvII4,24378
 tests/hyperparameter_tuning/test_estimator_tuner.py,sha256=iewME41d6LR2aQ0OtohGFtN_ocJUwTeqvs6L0QDmfG4,4413
 tests/hyperparameter_tuning/test_rating_tuner.py,sha256=usjC2ioO_yWRjjNAlRTyMVYheOrCi0kKocmHQHdTmpM,18699
-tests/performance_transformers/test_performance_manager.py,sha256=gjuuV_hb27kCo_kUecPKG3Cbot2Gqis1W3kw2A4ovS4,10690
-tests/performance_transformers/test_performances_transformers.py,sha256=A-tGiCx7kXrj1cVj03Bc7prOeZ1_Ryz8YFx9uj3eK6w,11064
-tests/ratings/test_player_rating_generator.py,sha256=9iepzvjlAlye-CkrEX5GT6Pf4Bf4qi_uDwB6Wamo1JY,104641
+tests/performance_transformers/test_performance_manager.py,sha256=fVXxSujE3OKE7tIRJjN5dWCLj9pkeXbuL6Zf0WrM0ZA,15698
+tests/performance_transformers/test_performances_transformers.py,sha256=HnW7GKQ6B0ova6Zy0lKbEpA6peZGFE4oi9Th6r7RnG0,18949
+tests/ratings/test_player_rating_generator.py,sha256=lFqFmEfy_sSyyeCmY0aCNaW3wj73ySVU3sp1O_m1os4,105713
 tests/ratings/test_player_rating_no_mutation.py,sha256=GzO3Hl__5K68DS3uRLefwnbcTJOvBM7cZqww4M21UZM,8493
 tests/ratings/test_ratings_property.py,sha256=ckyfGILXa4tfQvsgyXEzBDNr2DUmHwFRV13N60w66iE,6561
 tests/ratings/test_team_rating_generator.py,sha256=SqQcfckNmJJc99feCdnmkNYDape-p69e92Dp8Vzpu2w,101156
@@ -108,7 +108,7 @@ tests/transformers/test_other_transformer.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 tests/transformers/test_predictor_transformer.py,sha256=N1aBYLjN3ldpYZLwjih_gTFYSMitrZu-PNK78W6RHaQ,6877
 tests/transformers/test_simple_transformer.py,sha256=wWR0qjLb_uS4HXrJgGdiqugOY1X7kwd1_OPS02IT2b8,4676
 tests/transformers/test_team_ratio_predictor.py,sha256=fOUP_JvNJi-3kom3ZOs1EdG0I6Z8hpLpYKNHu1eWtOw,8562
-spforge-0.8.33.dist-info/METADATA,sha256=LH22MH7XYeFTROBWPYVVHWyYRSwMYGozGMCtaJdkLgg,20048
-spforge-0.8.33.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-spforge-0.8.33.dist-info/top_level.txt,sha256=6UW2M5a7WKOeaAi900qQmRKNj5-HZzE8-eUD9Y9LTq0,23
-spforge-0.8.33.dist-info/RECORD,,
+spforge-0.8.35.dist-info/METADATA,sha256=9ZQ0JmZkbQIGI48KUUlHw8jI8umvspKRztLv1E0EW60,20048
+spforge-0.8.35.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+spforge-0.8.35.dist-info/top_level.txt,sha256=6UW2M5a7WKOeaAi900qQmRKNj5-HZzE8-eUD9Y9LTq0,23
+spforge-0.8.35.dist-info/RECORD,,
@@ -4,7 +4,7 @@ import pandas as pd
 import polars as pl
 import pytest
 
-from spforge.performance_transformers import PerformanceWeightsManager
+from spforge.performance_transformers import PerformanceWeightsManager, QuantilePerformanceScaler
 from spforge.performance_transformers._performance_manager import (
     ColumnWeight,
     PerformanceManager,
@@ -306,3 +306,131 @@ def test_factory_sets_transformer_features_to_prefixed_inputs_and_features_out_t
         if idx + 1 < len(ts):
             assert t.features_out == ts[idx + 1].features
         assert t.features == expected_in
+
+
+class TestZeroInflationHandling:
+    @pytest.fixture
+    def zero_inflated_data(self):
+        """Create zero-inflated data with ~37.7% zeros."""
+        np.random.seed(42)
+        n = 1000
+        zeros = np.zeros(377)
+        nonzeros = np.random.exponential(scale=2, size=n - 377)
+        raw = np.concatenate([zeros, nonzeros])
+        np.random.shuffle(raw)
+        return raw
+
+    @pytest.mark.parametrize("frame", ["pd", "pl"])
+    def test_performance_manager_detects_zero_inflation(self, frame, zero_inflated_data):
+        """Test that PerformanceManager auto-detects zero-inflated distributions."""
+        df = _make_native_df(frame, {"x": zero_inflated_data})
+
+        pm = PerformanceManager(
+            features=["x"],
+            transformer_names=["symmetric", "partial_standard_scaler", "min_max"],
+            prefix="performance__",
+            performance_column="perf",
+            zero_inflation_threshold=0.15,
+        )
+
+        pm.fit(df)
+
+        # Should have switched to quantile scaler
+        assert pm._using_quantile_scaler is True
+        assert isinstance(pm.transformers[-1], QuantilePerformanceScaler)
+
+    @pytest.mark.parametrize("frame", ["pd", "pl"])
+    def test_performance_manager_uses_standard_pipeline_for_normal_data(self, frame):
+        """Test that PerformanceManager uses standard pipeline for non-zero-inflated data."""
+        np.random.seed(42)
+        # Normal distribution - no zero inflation
+        data = np.random.normal(loc=0.5, scale=0.1, size=1000)
+        df = _make_native_df(frame, {"x": data})
+
+        pm = PerformanceManager(
+            features=["x"],
+            transformer_names=["symmetric", "partial_standard_scaler", "min_max"],
+            prefix="performance__",
+            performance_column="perf",
+            zero_inflation_threshold=0.15,
+        )
+
+        pm.fit(df)
+
+        # Should NOT have switched to quantile scaler
+        assert pm._using_quantile_scaler is False
+        assert isinstance(pm.transformers[-1], MinMaxTransformer)
+
+    @pytest.mark.parametrize("frame", ["pd", "pl"])
+    def test_zero_inflation_output_properties(self, frame, zero_inflated_data):
+        """Test that zero-inflated output has correct properties."""
+        df = _make_native_df(frame, {"x": zero_inflated_data})
+
+        pm = PerformanceManager(
+            features=["x"],
+            transformer_names=["symmetric", "partial_standard_scaler", "min_max"],
+            prefix="performance__",
+            performance_column="perf",
+            zero_inflation_threshold=0.15,
+        )
+
+        result = pm.fit_transform(df)
+        result_nw = nw.from_native(result)
+        scaled = result_nw["performance__perf"].to_numpy()
+
+        # 1. All zeros should have the same scaled value (the midpoint of zero mass)
+        is_zero = np.abs(zero_inflated_data) < 1e-10
+        zero_scaled_values = scaled[is_zero]
+        assert np.allclose(zero_scaled_values, zero_scaled_values[0], atol=1e-10)
+
+        # 2. Zeros should have lower values than non-zeros (on average)
+        is_nonzero = ~is_zero
+        assert np.mean(scaled[is_zero]) < np.mean(scaled[is_nonzero])
+
+        # 3. Mean should be approximately 0.5
+        assert abs(np.mean(scaled) - 0.5) < 0.02
+
+        # 4. Monotonicity preserved
+        order = np.argsort(zero_inflated_data)
+        sorted_scaled = scaled[order]
+        assert np.all(np.diff(sorted_scaled) >= -1e-10)
+
+        # 5. Bounded [0, 1] (with clipping tolerance)
+        assert np.all((scaled >= pm.min_value) & (scaled <= pm.max_value))
+
+    @pytest.mark.parametrize("frame", ["pd", "pl"])
+    def test_disable_zero_inflation_detection(self, frame, zero_inflated_data):
+        """Test that zero_inflation_threshold=0 disables detection."""
+        df = _make_native_df(frame, {"x": zero_inflated_data})
+
+        pm = PerformanceManager(
+            features=["x"],
+            transformer_names=["symmetric", "partial_standard_scaler", "min_max"],
+            prefix="performance__",
+            performance_column="perf",
+            zero_inflation_threshold=0,  # Disable detection
+        )
+
+        pm.fit(df)
+
+        # Should NOT have switched to quantile scaler
+        assert pm._using_quantile_scaler is False
+
+    @pytest.mark.parametrize("frame", ["pd", "pl"])
+    def test_performance_weights_manager_zero_inflation(self, frame, zero_inflated_data):
+        """Test that PerformanceWeightsManager also handles zero inflation."""
+        df = _make_native_df(frame, {"feat_a": zero_inflated_data})
+
+        weights = [ColumnWeight(name="feat_a", weight=1.0)]
+        manager = PerformanceWeightsManager(
+            weights=weights,
+            # Use default transformers (None) to enable zero inflation detection
+            transformer_names=None,
+            prefix="",
+            zero_inflation_threshold=0.15,
+        )
+
+        manager.fit(df)
+
+        # Should have switched to quantile scaler
+        assert manager._using_quantile_scaler is True
@@ -1,3 +1,4 @@
+import numpy as np
 import pandas as pd
 import polars as pl
 import pytest
@@ -6,6 +7,7 @@ from sklearn.linear_model import LinearRegression
 
 from spforge.performance_transformers import (
     DiminishingValueTransformer,
+    QuantilePerformanceScaler,
     SymmetricDistributionTransformer,
 )
 from spforge.performance_transformers._performances_transformers import (
@@ -355,3 +357,197 @@ def test_symmetric_distribution_transformer_with_granularity_fit_transform():
         abs(transformed_df.loc[lambda x: x.position == "SG"]["performance"].skew())
         < transformer.skewness_allowed
     )
+
+
+class TestQuantilePerformanceScaler:
+    @pytest.fixture
+    def zero_inflated_data(self):
+        """Create zero-inflated data with ~37.7% zeros."""
+        np.random.seed(42)
+        n = 1000
+        # ~37.7% zeros
+        zeros = np.zeros(377)
+        # Non-zeros from exponential distribution
+        nonzeros = np.random.exponential(scale=2, size=n - 377)
+        raw = np.concatenate([zeros, nonzeros])
+        np.random.shuffle(raw)
+        return raw
+
+    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
+    def test_zeros_map_to_midpoint(self, df_type, zero_inflated_data):
+        """Test that zeros map to π/2 (midpoint of zero probability mass)."""
+        df = df_type({"performance": zero_inflated_data})
+
+        scaler = QuantilePerformanceScaler(features=["performance"], prefix="")
+        transformed = scaler.fit_transform(df)
+
+        if isinstance(transformed, pd.DataFrame):
+            scaled = transformed["performance"].values
+        else:
+            scaled = transformed["performance"].to_numpy()
+
+        pi = scaler._zero_proportion["performance"]
+        is_zero = np.abs(zero_inflated_data) < 1e-10
+
+        # Zeros should map to π/2
+        assert np.allclose(scaled[is_zero], pi / 2, atol=1e-10)
+
+    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
+    def test_mean_approximately_half(self, df_type, zero_inflated_data):
+        """Test that mean ≈ 0.5."""
+        df = df_type({"performance": zero_inflated_data})
+
+        scaler = QuantilePerformanceScaler(features=["performance"], prefix="")
+        transformed = scaler.fit_transform(df)
+
+        if isinstance(transformed, pd.DataFrame):
+            scaled = transformed["performance"].values
+        else:
+            scaled = transformed["performance"].to_numpy()
+
+        # Mean should be approximately 0.5
+        assert abs(np.mean(scaled) - 0.5) < 0.02
+
+    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
+    def test_monotonicity_preserved(self, df_type, zero_inflated_data):
+        """Test that monotonicity is preserved (sorted input → sorted output)."""
+        df = df_type({"performance": zero_inflated_data})
+
+        scaler = QuantilePerformanceScaler(features=["performance"], prefix="")
+        transformed = scaler.fit_transform(df)
+
+        if isinstance(transformed, pd.DataFrame):
+            scaled = transformed["performance"].values
+        else:
+            scaled = transformed["performance"].to_numpy()
+
+        # Check monotonicity: if we sort the raw data, the scaled values should also be sorted
+        order = np.argsort(zero_inflated_data)
+        sorted_scaled = scaled[order]
+        # Allow for tiny numerical errors
+        assert np.all(np.diff(sorted_scaled) >= -1e-10)
+
+    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
+    def test_bounded_zero_one(self, df_type, zero_inflated_data):
+        """Test that output is bounded [0, 1]."""
+        df = df_type({"performance": zero_inflated_data})
+
+        scaler = QuantilePerformanceScaler(features=["performance"], prefix="")
+        transformed = scaler.fit_transform(df)
+
+        if isinstance(transformed, pd.DataFrame):
+            scaled = transformed["performance"].values
+        else:
+            scaled = transformed["performance"].to_numpy()
+
+        assert np.all((scaled >= 0) & (scaled <= 1))
+
+    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
+    def test_nonzeros_span_pi_to_one(self, df_type, zero_inflated_data):
+        """Test that non-zeros map to range (π, 1)."""
+        df = df_type({"performance": zero_inflated_data})
+
+        scaler = QuantilePerformanceScaler(features=["performance"], prefix="")
+        transformed = scaler.fit_transform(df)
+
+        if isinstance(transformed, pd.DataFrame):
+            scaled = transformed["performance"].values
+        else:
+            scaled = transformed["performance"].to_numpy()
+
+        pi = scaler._zero_proportion["performance"]
+        is_nonzero = np.abs(zero_inflated_data) >= 1e-10
+
+        # Non-zeros should be >= π
+        assert np.all(scaled[is_nonzero] >= pi - 1e-10)
+        # Non-zeros should be <= 1
+        assert np.all(scaled[is_nonzero] <= 1 + 1e-10)
+
+    def test_with_prefix(self):
+        """Test that prefix is applied correctly."""
+        np.random.seed(42)
+        raw = np.concatenate([np.zeros(50), np.random.exponential(2, 50)])
+        df = pd.DataFrame({"feat": raw})
+
+        scaler = QuantilePerformanceScaler(features=["feat"], prefix="scaled_")
+        transformed = scaler.fit_transform(df)
+
+        assert "scaled_feat" in transformed.columns
+        assert scaler.features_out == ["scaled_feat"]
+
+    def test_multiple_features(self):
+        """Test that multiple features are handled correctly."""
+        np.random.seed(42)
+        raw_a = np.concatenate([np.zeros(50), np.random.exponential(2, 50)])
+        raw_b = np.concatenate([np.zeros(30), np.random.exponential(3, 70)])
+        df = pd.DataFrame({"a": raw_a, "b": raw_b})
+
+        scaler = QuantilePerformanceScaler(features=["a", "b"], prefix="")
+        transformed = scaler.fit_transform(df)
+
+        assert "a" in transformed.columns
+        assert "b" in transformed.columns
+
+        # Both should have mean ≈ 0.5
+        assert abs(transformed["a"].mean() - 0.5) < 0.05
+        assert abs(transformed["b"].mean() - 0.5) < 0.05
+
+    def test_all_zeros(self):
+        """Test edge case: all values are zero (π=1)."""
+        df = pd.DataFrame({"x": [0.0, 0.0, 0.0, 0.0, 0.0]})
+
+        scaler = QuantilePerformanceScaler(features=["x"], prefix="")
+        transformed = scaler.fit_transform(df)
+
+        # π=1, so all values should map to π/2 = 0.5
+        assert np.allclose(transformed["x"].values, 0.5)
+        assert scaler._zero_proportion["x"] == 1.0
+
+    def test_no_zeros(self):
+        """Test edge case: no zeros (π=0)."""
+        np.random.seed(42)
+        df = pd.DataFrame({"x": np.random.exponential(2, 100) + 0.1})  # All positive
+
+        scaler = QuantilePerformanceScaler(features=["x"], prefix="")
+        transformed = scaler.fit_transform(df)
+
+        # π=0, so values should span (0, 1) via quantiles
+        assert scaler._zero_proportion["x"] == 0.0
+        assert transformed["x"].min() >= 0
+        assert transformed["x"].max() <= 1
+        # Mean should still be ~0.5
+        assert abs(transformed["x"].mean() - 0.5) < 0.05
+
+    def test_nan_handling(self):
+        """Test that NaN values are preserved in output."""
+        df = pd.DataFrame({"x": [0.0, 1.0, np.nan, 2.0, 0.0, np.nan, 3.0]})
+
+        scaler = QuantilePerformanceScaler(features=["x"], prefix="")
+        transformed = scaler.fit_transform(df)
+
+        # NaN positions should remain NaN
+        assert np.isnan(transformed["x"].iloc[2])
+        assert np.isnan(transformed["x"].iloc[5])
+
+        # Non-NaN values should be valid
+        non_nan_mask = ~np.isnan(transformed["x"].values)
+        assert np.all((transformed["x"].values[non_nan_mask] >= 0) &
+                      (transformed["x"].values[non_nan_mask] <= 1))
+
+    def test_single_unique_nonzero(self):
+        """Test edge case: single unique non-zero value."""
+        df = pd.DataFrame({"x": [0.0, 0.0, 5.0, 5.0, 0.0, 5.0]})
+
+        scaler = QuantilePerformanceScaler(features=["x"], prefix="")
+        transformed = scaler.fit_transform(df)
+
+        # Should still work - zeros map to π/2, non-zeros to (π, 1)
+        pi = scaler._zero_proportion["x"]
+        is_zero = df["x"] == 0
+
+        # Zeros should map to π/2
+        assert np.allclose(transformed["x"].values[is_zero.values], pi / 2)
+
+        # Non-zeros should all map to same value (since they're all equal)
+        nonzero_values = transformed["x"].values[~is_zero.values]
+        assert np.allclose(nonzero_values, nonzero_values[0])
@@ -1951,9 +1951,11 @@ def test_fit_transform__player_rating_difference_from_team_projected_feature(bas
     assert player_col in result.columns
     assert team_col in result.columns
 
-    for row in result.iter_rows(named=True):
-        expected = row[player_col] - row[team_col]
-        assert row[diff_col] == pytest.approx(expected, rel=1e-9)
+    # Verify diff = player - team (vectorized)
+    max_diff = result.select(
+        (pl.col(diff_col) - (pl.col(player_col) - pl.col(team_col))).abs().max()
+    ).item()
+    assert max_diff < 1e-9
 
 
 def test_fit_transform__start_league_quantile_uses_existing_player_ratings(base_cn):
@@ -2909,3 +2911,30 @@ def test_fit_transform_null_perf_with_use_off_def_split_false__no_crash(base_cn)
     # TypeError: float() argument must be a string or a number, not 'NoneType'
     result = gen.fit_transform(df)
     assert result is not None
+
+
+def test_player_opponent_mean_projected_feature(base_cn, sample_df):
+    """Test that PLAYER_OPPONENT_MEAN_PROJECTED outputs mean of player and opponent team ratings."""
+    gen = PlayerRatingGenerator(
+        performance_column="perf",
+        column_names=base_cn,
+        auto_scale_performance=True,
+        features_out=[
+            RatingKnownFeatures.PLAYER_RATING,
+            RatingKnownFeatures.OPPONENT_RATING_PROJECTED,
+            RatingKnownFeatures.PLAYER_OPPONENT_MEAN_PROJECTED,
+        ],
+    )
+    result = gen.fit_transform(sample_df)
+
+    # Verify column exists
+    assert "player_opponent_mean_projected_perf" in result.columns
+
+    # Verify it's the mean of player_rating and opponent_rating_projected (vectorized)
+    expected = (
+        pl.col("player_rating_perf") + pl.col("opponent_rating_projected_perf")
+    ) / 2
+    diff = result.select(
+        (pl.col("player_opponent_mean_projected_perf") - expected).abs().max()
+    ).item()
+    assert diff < 1e-6, f"Max difference from expected mean: {diff}"