spforge 0.8.33__py3-none-any.whl → 0.8.36__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of spforge might be problematic. Click here for more details.

@@ -6,5 +6,6 @@ from ._performances_transformers import (
6
6
  DiminishingValueTransformer as DiminishingValueTransformer,
7
7
  MinMaxTransformer as MinMaxTransformer,
8
8
  PartialStandardScaler as PartialStandardScaler,
9
+ QuantilePerformanceScaler as QuantilePerformanceScaler,
9
10
  SymmetricDistributionTransformer as SymmetricDistributionTransformer,
10
11
  )
@@ -4,6 +4,7 @@ from dataclasses import dataclass
4
4
  from typing import Literal
5
5
 
6
6
  import narwhals.stable.v2 as nw
7
+ import numpy as np
7
8
  from narwhals.typing import IntoFrameT
8
9
  from sklearn.base import BaseEstimator, TransformerMixin
9
10
 
@@ -11,6 +12,7 @@ from spforge.performance_transformers._performances_transformers import (
11
12
  MinMaxTransformer,
12
13
  NarwhalsFeatureTransformer,
13
14
  PartialStandardScaler,
15
+ QuantilePerformanceScaler,
14
16
  SymmetricDistributionTransformer,
15
17
  )
16
18
 
@@ -86,9 +88,12 @@ class PerformanceManager(BaseEstimator, TransformerMixin):
86
88
  prefix: str = "performance__",
87
89
  min_value: float = -0.02,
88
90
  max_value: float = 1.02,
91
+ zero_inflation_threshold: float = 0.15,
89
92
  ):
90
93
  self.features = features
91
94
  self.prefix = prefix
95
+ # Store whether user explicitly disabled transformers (passed empty list)
96
+ self._user_disabled_transformers = transformer_names is not None and len(transformer_names) == 0
92
97
  self.transformer_names = transformer_names or [
93
98
  "symmetric",
94
99
  "partial_standard_scaler",
@@ -100,6 +105,7 @@ class PerformanceManager(BaseEstimator, TransformerMixin):
100
105
  self.performance_column = self.prefix + performance_column
101
106
  self.min_value = min_value
102
107
  self.max_value = max_value
108
+ self.zero_inflation_threshold = zero_inflation_threshold
103
109
 
104
110
  self.transformers = create_performance_scalers_transformers(
105
111
  transformer_names=self.transformer_names,
@@ -107,9 +113,47 @@ class PerformanceManager(BaseEstimator, TransformerMixin):
107
113
  features=self.features,
108
114
  prefix=self.prefix,
109
115
  )
116
+ self._using_quantile_scaler = False
110
117
 
111
118
  @nw.narwhalify
112
119
  def fit(self, df: IntoFrameT, y=None):
120
+ # Check for zero-inflated distributions and swap to quantile scaler if needed
121
+ # Only apply when user hasn't explicitly disabled transformers (passed empty list)
122
+ if self.zero_inflation_threshold > 0 and not self._user_disabled_transformers:
123
+ df = self._ensure_inputs_exist(df, self.transformers[0])
124
+ prefixed_features = [self.prefix + f for f in self.features]
125
+
126
+ for feature in prefixed_features:
127
+ if feature in df.columns:
128
+ values = df[feature].to_numpy()
129
+ values = values[np.isfinite(values)]
130
+
131
+ # Skip if binary/categorical data (few unique values)
132
+ # Quantile scaler is for continuous zero-inflated data, not binary outcomes
133
+ n_unique = len(np.unique(values))
134
+ if n_unique <= 3:
135
+ continue
136
+
137
+ zero_proportion = np.mean(np.abs(values) < 1e-10)
138
+
139
+ if zero_proportion > self.zero_inflation_threshold:
140
+ logging.info(
141
+ f"Detected zero-inflated distribution for {feature} "
142
+ f"({zero_proportion:.1%} zeros). Using QuantilePerformanceScaler."
143
+ )
144
+ self._using_quantile_scaler = True
145
+ # Use original_transformers (deepcopy made before standard transformers
146
+ # were appended to custom_transformers)
147
+ self.transformers = [
148
+ copy.deepcopy(t) for t in self.original_transformers
149
+ ] + [
150
+ QuantilePerformanceScaler(
151
+ features=prefixed_features,
152
+ prefix="",
153
+ )
154
+ ]
155
+ break
156
+
113
157
  for t in self.transformers:
114
158
  df = self._ensure_inputs_exist(df, t)
115
159
  t.fit(df)
@@ -169,6 +213,7 @@ class PerformanceWeightsManager(PerformanceManager):
169
213
  min_value: float = -0.02,
170
214
  prefix: str = "performance__",
171
215
  return_all_features: bool = False,
216
+ zero_inflation_threshold: float = 0.15,
172
217
  ):
173
218
  self.weights = weights
174
219
  self.return_all_features = return_all_features
@@ -181,6 +226,7 @@ class PerformanceWeightsManager(PerformanceManager):
181
226
  max_value=max_value,
182
227
  min_value=min_value,
183
228
  performance_column=performance_column,
229
+ zero_inflation_threshold=zero_inflation_threshold,
184
230
  )
185
231
 
186
232
  @nw.narwhalify
@@ -3,6 +3,7 @@ from typing import Literal, Protocol
3
3
 
4
4
  import narwhals
5
5
  import narwhals.stable.v2 as nw
6
+ import numpy as np
6
7
  from lightgbm import LGBMRegressor
7
8
  from narwhals.typing import IntoFrameT
8
9
  from sklearn.base import BaseEstimator, TransformerMixin
@@ -420,3 +421,82 @@ class GroupByTransformer(BaseEstimator, TransformerMixin):
420
421
  @nw.narwhalify
421
422
  def transform(self, df: IntoFrameT) -> IntoFrameT:
422
423
  return df.join(self._grouped, on=self.granularity, how="left").to_native()
424
+
425
+
426
class QuantilePerformanceScaler(BaseEstimator, TransformerMixin):
    """
    Quantile-based scaling for zero-inflated distributions.

    Applies a probability integral transform per feature. Let ``p0`` be the
    share of (near-)zero values among the finite values seen during ``fit``
    (this is the quantity the original notes called "π"; it is not the
    constant 3.14...):

    - Zeros -> ``p0 / 2`` (midpoint of the zero probability mass)
    - Non-zeros -> ``p0 + (1 - p0) * rank``, where ``rank`` in [0, 1] is
      interpolated from the percentiles of the non-zero training values
    - Non-finite inputs (NaN / inf) -> NaN

    NOTE(review): negative values are handled like any other non-zero value,
    i.e. they are ranked *above* the zero mass. Confirm callers only pass
    non-negative data if monotonicity matters.

    Parameters
    ----------
    features:
        Names of the input columns to scale.
    zero_threshold:
        Values whose absolute value is below this are counted as zero.
    n_quantiles:
        Number of intervals of the percentile grid; ``n_quantiles + 1``
        percentile points are stored per feature.
    prefix:
        Prepended to each feature name to form the output column name.
    """

    def __init__(
        self,
        features: list[str],
        zero_threshold: float = 1e-10,
        n_quantiles: int = 1000,
        prefix: str = "",
    ):
        self.features = features
        self.zero_threshold = zero_threshold
        self.n_quantiles = n_quantiles
        self.prefix = prefix
        self.features_out = [self.prefix + f for f in self.features]

        # Fitted state, keyed by input feature name.
        self._zero_proportion: dict[str, float] = {}
        self._nonzero_quantiles: dict[str, np.ndarray | None] = {}

    @nw.narwhalify
    def fit(self, df: IntoFrameT, y=None):
        """Learn the zero share and the non-zero percentile grid of each feature.

        Only finite values take part; ``y`` is ignored. Returns ``self``.
        """
        # The percentile grid does not depend on the feature, so build it once.
        percentiles = np.linspace(0, 100, self.n_quantiles + 1)

        for feature in self.features:
            values = df[feature].to_numpy()
            values = values[np.isfinite(values)]

            is_zero = np.abs(values) < self.zero_threshold
            # np.mean of an empty array yields NaN *and* emits a RuntimeWarning;
            # keep the NaN result (nothing can be learned) but skip the warning.
            self._zero_proportion[feature] = (
                float(np.mean(is_zero)) if values.size else float("nan")
            )

            nonzero_values = values[~is_zero]
            if len(nonzero_values) > 0:
                self._nonzero_quantiles[feature] = np.percentile(nonzero_values, percentiles)
            else:
                # No non-zero reference values were observed for this feature.
                self._nonzero_quantiles[feature] = None
        return self

    @nw.narwhalify
    def transform(self, df: IntoFrameT) -> IntoFrameT:
        """Write the scaled value of every feature to ``prefix + feature``.

        Raises ``KeyError`` if a feature was not seen by ``fit``.
        """
        for feature in self.features:
            out_feature = self.prefix + feature
            values = df[feature].to_numpy()
            # Start from all-NaN so non-finite inputs stay NaN in the output.
            result = np.full_like(values, np.nan, dtype=float)

            is_finite = np.isfinite(values)
            is_zero = is_finite & (np.abs(values) < self.zero_threshold)
            is_nonzero = is_finite & ~is_zero

            zero_share = self._zero_proportion[feature]

            # Zeros -> midpoint of the zero mass.
            result[is_zero] = zero_share / 2

            # Non-zeros -> interpolate into (zero_share, 1].
            if np.any(is_nonzero):
                nonzero_quantiles = self._nonzero_quantiles[feature]
                if nonzero_quantiles is None:
                    # Fit saw no non-zero values, so there is no CDF to rank
                    # against. Previously these finite inputs were silently
                    # turned into NaN; rank them at the top instead, which
                    # evaluates to 1.0 when the fitted zero share is 1.0.
                    ranks = 1.0
                else:
                    # Clip to the fitted range so out-of-range inputs land on
                    # the lowest / highest rank.
                    clipped = np.clip(
                        values[is_nonzero], nonzero_quantiles[0], nonzero_quantiles[-1]
                    )
                    ranks = np.interp(
                        clipped,
                        nonzero_quantiles,
                        np.linspace(0, 1, len(nonzero_quantiles)),
                    )
                result[is_nonzero] = zero_share + (1 - zero_share) * ranks

            df = df.with_columns(**{out_feature: result})

        return df.to_native()
spforge/ratings/_base.py CHANGED
@@ -156,17 +156,20 @@ class RatingGenerator(FeatureGenerator):
156
156
  )
157
157
 
158
158
  perf = df[self.performance_column]
159
- if perf.max() > 1.02 or perf.min() < -0.02:
160
- raise ValueError(
161
- f"Max {self.performance_column} must be less than than 1.02 and min value larger than -0.02. "
162
- "Either transform it manually or set auto_scale_performance to True"
163
- )
159
+ # Filter to finite values for validation (NaN/inf are treated as missing data)
160
+ finite_perf = perf.filter(perf.is_finite())
161
+ if len(finite_perf) > 0:
162
+ if finite_perf.max() > 1.02 or finite_perf.min() < -0.02:
163
+ raise ValueError(
164
+ f"Max {self.performance_column} must be less than than 1.02 and min value larger than -0.02. "
165
+ "Either transform it manually or set auto_scale_performance to True"
166
+ )
164
167
 
165
- if perf.mean() < 0.42 or perf.mean() > 0.58:
166
- raise ValueError(
167
- f"Mean {self.performance_column} must be between 0.42 and 0.58. "
168
- "Either transform it manually or set auto_scale_performance to True"
169
- )
168
+ if finite_perf.mean() < 0.42 or finite_perf.mean() > 0.58:
169
+ raise ValueError(
170
+ f"Mean {self.performance_column} must be between 0.42 and 0.58. "
171
+ "Either transform it manually or set auto_scale_performance to True"
172
+ )
170
173
 
171
174
  pl_df: pl.DataFrame
172
175
  pl_df = df.to_native() if df.implementation.is_polars() else df.to_polars().to_native()
@@ -29,6 +29,7 @@ from spforge.ratings._base import RatingGenerator, RatingKnownFeatures, RatingUn
29
29
  from spforge.ratings.start_rating_generator import StartRatingGenerator
30
30
  from spforge.ratings.utils import (
31
31
  add_opp_team_rating,
32
+ add_player_opponent_mean_projected,
32
33
  add_rating_difference_projected,
33
34
  add_rating_mean_projected,
34
35
  add_team_rating,
@@ -141,6 +142,9 @@ class PlayerRatingGenerator(RatingGenerator):
141
142
  self.PLAYER_DIFF_FROM_TEAM_PROJ_COL = self._suffix(
142
143
  str(RatingKnownFeatures.PLAYER_RATING_DIFFERENCE_FROM_TEAM_PROJECTED)
143
144
  )
145
+ self.PLAYER_OPP_MEAN_PROJ_COL = self._suffix(
146
+ str(RatingKnownFeatures.PLAYER_OPPONENT_MEAN_PROJECTED)
147
+ )
144
148
 
145
149
  self.TEAM_OFF_RATING_PROJ_COL = self._suffix(
146
150
  str(RatingKnownFeatures.TEAM_OFF_RATING_PROJECTED)
@@ -583,8 +587,9 @@ class PlayerRatingGenerator(RatingGenerator):
583
587
  )
584
588
 
585
589
  perf_value = pre_player.match_performance.performance_value
590
+ perf_is_valid = perf_value is not None and math.isfinite(float(perf_value))
586
591
 
587
- if perf_value is None:
592
+ if not perf_is_valid:
588
593
  off_change = 0.0
589
594
  else:
590
595
  off_perf = float(perf_value)
@@ -595,7 +600,7 @@ class PlayerRatingGenerator(RatingGenerator):
595
600
  * float(pre_player.match_performance.participation_weight)
596
601
  )
597
602
 
598
- if team1_def_perf is None or (not self.use_off_def_split and perf_value is None):
603
+ if team1_def_perf is None or (not self.use_off_def_split and not perf_is_valid):
599
604
  def_change = 0.0
600
605
  else:
601
606
  def_perf = float(team1_def_perf)
@@ -677,8 +682,9 @@ class PlayerRatingGenerator(RatingGenerator):
677
682
  )
678
683
 
679
684
  perf_value = pre_player.match_performance.performance_value
685
+ perf_is_valid = perf_value is not None and math.isfinite(float(perf_value))
680
686
 
681
- if perf_value is None:
687
+ if not perf_is_valid:
682
688
  off_change = 0.0
683
689
  else:
684
690
  off_perf = float(perf_value)
@@ -689,7 +695,7 @@ class PlayerRatingGenerator(RatingGenerator):
689
695
  * float(pre_player.match_performance.participation_weight)
690
696
  )
691
697
 
692
- if team2_def_perf is None or (not self.use_off_def_split and perf_value is None):
698
+ if team2_def_perf is None or (not self.use_off_def_split and not perf_is_valid):
693
699
  def_change = 0.0
694
700
  else:
695
701
  def_perf = float(team2_def_perf)
@@ -844,6 +850,7 @@ class PlayerRatingGenerator(RatingGenerator):
844
850
  or self.OPP_RATING_PROJ_COL in cols_to_add
845
851
  or self.DIFF_PROJ_COL in cols_to_add
846
852
  or self.PLAYER_DIFF_PROJ_COL in cols_to_add
853
+ or self.PLAYER_OPP_MEAN_PROJ_COL in cols_to_add
847
854
  ):
848
855
  df = add_team_rating_projected(
849
856
  df=df,
@@ -865,6 +872,7 @@ class PlayerRatingGenerator(RatingGenerator):
865
872
  or self.OPP_RATING_PROJ_COL in cols_to_add
866
873
  or self.DIFF_PROJ_COL in cols_to_add
867
874
  or self.PLAYER_DIFF_PROJ_COL in cols_to_add
875
+ or self.PLAYER_OPP_MEAN_PROJ_COL in cols_to_add
868
876
  ):
869
877
  df = add_opp_team_rating(
870
878
  df=df,
@@ -925,6 +933,15 @@ class PlayerRatingGenerator(RatingGenerator):
925
933
  rating_mean_out=self.MEAN_PROJ_COL,
926
934
  )
927
935
 
936
+ if self.PLAYER_OPP_MEAN_PROJ_COL in cols_to_add:
937
+ df = add_player_opponent_mean_projected(
938
+ df=df,
939
+ column_names=cn,
940
+ player_rating_col=self.PLAYER_RATING_COL,
941
+ opp_team_rating_col=self.OPP_RATING_PROJ_COL,
942
+ out_col=self.PLAYER_OPP_MEAN_PROJ_COL,
943
+ )
944
+
928
945
  if self.DIFF_COL in cols_to_add and self.DIFF_COL not in df.columns:
929
946
  if self.TEAM_RATING_COL not in df.columns:
930
947
  df = add_team_rating(
@@ -1239,7 +1256,7 @@ class PlayerRatingGenerator(RatingGenerator):
1239
1256
  self, c: PreMatchPlayersCollection
1240
1257
  ) -> float | None:
1241
1258
  # observed offense perf = weighted mean of player performance_value using participation_weight if present
1242
- # skip players with null performance
1259
+ # skip players with null/non-finite performance
1243
1260
  cn = self.column_names
1244
1261
  if not c.pre_match_player_ratings:
1245
1262
  return None
@@ -1249,12 +1266,15 @@ class PlayerRatingGenerator(RatingGenerator):
1249
1266
  perf_val = pre.match_performance.performance_value
1250
1267
  if perf_val is None:
1251
1268
  continue
1269
+ perf_float = float(perf_val)
1270
+ if not math.isfinite(perf_float):
1271
+ continue
1252
1272
  w = (
1253
1273
  float(pre.match_performance.participation_weight)
1254
1274
  if cn.participation_weight
1255
1275
  else 1.0
1256
1276
  )
1257
- psum += float(perf_val) * w
1277
+ psum += perf_float * w
1258
1278
  wsum += w
1259
1279
  return psum / wsum if wsum else None
1260
1280
 
@@ -1326,7 +1346,9 @@ class PlayerRatingGenerator(RatingGenerator):
1326
1346
  self.performance_column in team_player
1327
1347
  and team_player[self.performance_column] is not None
1328
1348
  ):
1329
- return float(team_player[self.performance_column])
1349
+ val = float(team_player[self.performance_column])
1350
+ if math.isfinite(val):
1351
+ return val
1330
1352
  return None
1331
1353
 
1332
1354
  def ensure_new_player(
spforge/ratings/enums.py CHANGED
@@ -17,6 +17,7 @@ class RatingKnownFeatures(StrEnum):
17
17
  PLAYER_RATING_DIFFERENCE_PROJECTED = "player_rating_difference_projected"
18
18
  TEAM_RATING_DIFFERENCE_PROJECTED = "team_rating_difference_projected"
19
19
  RATING_MEAN_PROJECTED = "rating_mean_projected"
20
+ PLAYER_OPPONENT_MEAN_PROJECTED = "player_opponent_mean_projected"
20
21
  TEAM_LEAGUE = "team_league"
21
22
  PLAYER_LEAGUE = "player_league"
22
23
  OPPONENT_LEAGUE = "opponent_league"
spforge/ratings/utils.py CHANGED
@@ -137,3 +137,16 @@ def add_rating_mean_projected(
137
137
  )
138
138
 
139
139
  return df.with_columns(pl.col(player_rating_col).mean().over(mid).alias(rating_mean_out))
140
+
141
+
142
def add_player_opponent_mean_projected(
    df: pl.DataFrame,
    column_names: ColumnNames,
    player_rating_col: str,
    opp_team_rating_col: str,
    out_col: str,
) -> pl.DataFrame:
    """Add ``out_col`` holding the row-wise average of two rating columns.

    The value is ``(player_rating_col + opp_team_rating_col) / 2``.
    ``column_names`` is accepted but not read by this function.
    """
    rating_sum = pl.col(player_rating_col) + pl.col(opp_team_rating_col)
    midpoint = (rating_sum / 2).alias(out_col)
    return df.with_columns(midpoint)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: spforge
3
- Version: 0.8.33
3
+ Version: 0.8.36
4
4
  Summary: A flexible framework for generating features, ratings, and building machine learning or other models for training and inference on sports data.
5
5
  Author-email: Mathias Holmstrøm <mathiasholmstom@gmail.com>
6
6
  License: See LICENSE file
@@ -46,21 +46,21 @@ spforge/feature_generator/_utils.py,sha256=KDn33ia1OYJTK8THFpvc_uRiH_Bl3fImGqqbf
46
46
  spforge/hyperparameter_tuning/__init__.py,sha256=Vcl8rVlJ7M708iPgqe4XxpZWgJKGux0Y5HgMCymRsHg,1099
47
47
  spforge/hyperparameter_tuning/_default_search_spaces.py,sha256=SjwXLpvYIu_JY8uPRHeL5Kgp1aa0slWDz8qsKDaohWQ,8020
48
48
  spforge/hyperparameter_tuning/_tuner.py,sha256=M79q3saM6r0UZJsRUUgfdDr-3Qii-F2-wuSAZLFtZDo,19246
49
- spforge/performance_transformers/__init__.py,sha256=U6d7_kltbUMLYCGBk4QAFVPJTxXD3etD9qUftV-O3q4,422
50
- spforge/performance_transformers/_performance_manager.py,sha256=WmjmlMEnq7y75MiI_s9Y-9eMXIyhPTUKrwsXRtgYp0k,9620
51
- spforge/performance_transformers/_performances_transformers.py,sha256=0lxuWjAfWBRXRgQsNJHjw3P-nlTtHBu4_bOVdoy7hq4,15536
49
+ spforge/performance_transformers/__init__.py,sha256=J-5olqi1M_BUj3sN1NqAz9s28XAbuKK9M9xHq7IGlQU,482
50
+ spforge/performance_transformers/_performance_manager.py,sha256=tR_4laGoC_KFRaw3Gy0TMI-r5gnicDmvmxPEgAvl4E0,12031
51
+ spforge/performance_transformers/_performances_transformers.py,sha256=ZjkFDXoEe5fURpN-dNkrgFXpHEg4aFCWdBDnPyLtgkM,18368
52
52
  spforge/ratings/__init__.py,sha256=OZVH2Lo6END3n1X8qi4QcyAPlThIwAYwVKCiIuOQSQU,576
53
- spforge/ratings/_base.py,sha256=ne4BRrYFPqMirdFPVnyDN44wjFQwOQgWoUXu_59xgWE,14687
54
- spforge/ratings/_player_rating.py,sha256=Ii1HNz0tC25dxZy9B2b_ULZVNwExNvdyIGw_1gRIeko,67163
53
+ spforge/ratings/_base.py,sha256=Z-VVXWmnzR0O7o2_Q2x2ru_3uiTMpWqKDGQaNBJxCMA,14927
54
+ spforge/ratings/_player_rating.py,sha256=AIpDEl6cZaC3urcY-jFFgUWd4WZ71A33c5mOPfkXdMs,68178
55
55
  spforge/ratings/_team_rating.py,sha256=3m90-R2zW0k5EHwjw-83Hacz91fGmxW1LQ8ZUGHlgt4,24970
56
- spforge/ratings/enums.py,sha256=s7z_RcZS6Nlgfa_6tasO8_IABZJwywexe7sep9DJBgo,1739
56
+ spforge/ratings/enums.py,sha256=maG0X4WMQeMVAc2wbceq1an-U-z8moZGeG2BAgfICDA,1809
57
57
  spforge/ratings/league_identifier.py,sha256=_KDUKOwoNU6RNFKE5jju4eYFGVNGBdJsv5mhNvMakfc,6019
58
58
  spforge/ratings/league_start_rating_optimizer.py,sha256=Q4Vo3QT-r55qP4aD9WftsTB00UOSRvxM1khlyuAGWNM,8582
59
59
  spforge/ratings/player_performance_predictor.py,sha256=GtPpYlALgbQk8YHeaiRbpRvJHxeAhKpRxsaVUc9zR5o,7963
60
60
  spforge/ratings/start_rating_generator.py,sha256=eSasa5Oe9n4IoTGjFCYyFQAGrJtzrBW-Qor97lmaYuM,6776
61
61
  spforge/ratings/team_performance_predictor.py,sha256=ThQOmYQUqKBB46ONYHOMM2arXFH8AkyKpAZzs80SjHA,7217
62
62
  spforge/ratings/team_start_rating_generator.py,sha256=vK-_m8KwcHopchch_lKNHSGLiiNm5q9Lenm0d1cP_po,5110
63
- spforge/ratings/utils.py,sha256=_zFemqz2jJkH8rn2EZpDt8N6FELUmYp9qCnPzRtOIGU,4497
63
+ spforge/ratings/utils.py,sha256=WFxpiutHG9itJtjtagb26mjpHRjIhT7hopsiyetUgTg,4866
64
64
  spforge/scorer/__init__.py,sha256=wj8PCvYIl6742Xwmt86c3oy6iqE8Ss-OpwHud6kd9IY,256
65
65
  spforge/scorer/_score.py,sha256=DOl3wlHH0IlQelQA5CaNAfVtJhc544ZO5l-1mEno7nA,65276
66
66
  spforge/transformers/__init__.py,sha256=IPCsMcsgBqG52d0ttATLCY4HvFCQZddExlLt74U-zuI,390
@@ -71,7 +71,7 @@ spforge/transformers/_other_transformer.py,sha256=w2a7Wnki3vJe4GAkSa4kealw0GILIo
71
71
  spforge/transformers/_predictor.py,sha256=2sE6gfVrilXzPVcBurSrtqHw33v2ljygQcEYXt9LhZc,3119
72
72
  spforge/transformers/_simple_transformer.py,sha256=zGUFNQYMeoDSa2CoQejQNiNmKCBN5amWTvyOchiUHj0,5660
73
73
  spforge/transformers/_team_ratio_predictor.py,sha256=g8_bR53Yyv0iNCtol1O9bgJSeZcIco_AfbQuUxQJkeY,6884
74
- spforge-0.8.33.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
74
+ spforge-0.8.36.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
75
75
  tests/test_autopipeline.py,sha256=7cNAn-nmGolfyfk3THh9IKcHZfRA-pLYC_xAyMg-No4,26863
76
76
  tests/test_autopipeline_context.py,sha256=IuRUY4IA6uMObvbl2pXSaXO2_tl3qX6wEbTZY0dkTMI,1240
77
77
  tests/test_feature_generator_pipeline.py,sha256=CK0zVL8PfTncy3RmG9i-YpgwjOIV7yJhV7Q44tbetI8,19020
@@ -92,9 +92,9 @@ tests/feature_generator/test_rolling_mean_days.py,sha256=EyOvdJDnmgPfe13uQBOkwo7
92
92
  tests/feature_generator/test_rolling_window.py,sha256=_o9oljcAIZ14iI7e8WFeAsfXxILnyqBffit21HOvII4,24378
93
93
  tests/hyperparameter_tuning/test_estimator_tuner.py,sha256=iewME41d6LR2aQ0OtohGFtN_ocJUwTeqvs6L0QDmfG4,4413
94
94
  tests/hyperparameter_tuning/test_rating_tuner.py,sha256=usjC2ioO_yWRjjNAlRTyMVYheOrCi0kKocmHQHdTmpM,18699
95
- tests/performance_transformers/test_performance_manager.py,sha256=gjuuV_hb27kCo_kUecPKG3Cbot2Gqis1W3kw2A4ovS4,10690
96
- tests/performance_transformers/test_performances_transformers.py,sha256=A-tGiCx7kXrj1cVj03Bc7prOeZ1_Ryz8YFx9uj3eK6w,11064
97
- tests/ratings/test_player_rating_generator.py,sha256=9iepzvjlAlye-CkrEX5GT6Pf4Bf4qi_uDwB6Wamo1JY,104641
95
+ tests/performance_transformers/test_performance_manager.py,sha256=fVXxSujE3OKE7tIRJjN5dWCLj9pkeXbuL6Zf0WrM0ZA,15698
96
+ tests/performance_transformers/test_performances_transformers.py,sha256=HnW7GKQ6B0ova6Zy0lKbEpA6peZGFE4oi9Th6r7RnG0,18949
97
+ tests/ratings/test_player_rating_generator.py,sha256=1Pkx0H8xJMTeLc2Fu9zJcoDpBWiY2zCVSxuBFJk2uEs,110717
98
98
  tests/ratings/test_player_rating_no_mutation.py,sha256=GzO3Hl__5K68DS3uRLefwnbcTJOvBM7cZqww4M21UZM,8493
99
99
  tests/ratings/test_ratings_property.py,sha256=ckyfGILXa4tfQvsgyXEzBDNr2DUmHwFRV13N60w66iE,6561
100
100
  tests/ratings/test_team_rating_generator.py,sha256=SqQcfckNmJJc99feCdnmkNYDape-p69e92Dp8Vzpu2w,101156
@@ -108,7 +108,7 @@ tests/transformers/test_other_transformer.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
108
108
  tests/transformers/test_predictor_transformer.py,sha256=N1aBYLjN3ldpYZLwjih_gTFYSMitrZu-PNK78W6RHaQ,6877
109
109
  tests/transformers/test_simple_transformer.py,sha256=wWR0qjLb_uS4HXrJgGdiqugOY1X7kwd1_OPS02IT2b8,4676
110
110
  tests/transformers/test_team_ratio_predictor.py,sha256=fOUP_JvNJi-3kom3ZOs1EdG0I6Z8hpLpYKNHu1eWtOw,8562
111
- spforge-0.8.33.dist-info/METADATA,sha256=LH22MH7XYeFTROBWPYVVHWyYRSwMYGozGMCtaJdkLgg,20048
112
- spforge-0.8.33.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
113
- spforge-0.8.33.dist-info/top_level.txt,sha256=6UW2M5a7WKOeaAi900qQmRKNj5-HZzE8-eUD9Y9LTq0,23
114
- spforge-0.8.33.dist-info/RECORD,,
111
+ spforge-0.8.36.dist-info/METADATA,sha256=HxggFJqUQNu2SdjRwCHclWb3_5t1z5Ensjg1AXiVtXU,20048
112
+ spforge-0.8.36.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
113
+ spforge-0.8.36.dist-info/top_level.txt,sha256=6UW2M5a7WKOeaAi900qQmRKNj5-HZzE8-eUD9Y9LTq0,23
114
+ spforge-0.8.36.dist-info/RECORD,,
@@ -4,7 +4,7 @@ import pandas as pd
4
4
  import polars as pl
5
5
  import pytest
6
6
 
7
- from spforge.performance_transformers import PerformanceWeightsManager
7
+ from spforge.performance_transformers import PerformanceWeightsManager, QuantilePerformanceScaler
8
8
  from spforge.performance_transformers._performance_manager import (
9
9
  ColumnWeight,
10
10
  PerformanceManager,
@@ -306,3 +306,131 @@ def test_factory_sets_transformer_features_to_prefixed_inputs_and_features_out_t
306
306
  if idx + 1 < len(ts):
307
307
  assert t.features_out == ts[idx + 1].features
308
308
  assert t.features == expected_in
309
+
310
+
311
class TestZeroInflationHandling:
    """Zero-inflation detection in PerformanceManager / PerformanceWeightsManager."""

    @staticmethod
    def _standard_manager(threshold):
        """Build a PerformanceManager on feature "x" with the explicit standard chain."""
        return PerformanceManager(
            features=["x"],
            transformer_names=["symmetric", "partial_standard_scaler", "min_max"],
            prefix="performance__",
            performance_column="perf",
            zero_inflation_threshold=threshold,
        )

    @pytest.fixture
    def zero_inflated_data(self):
        """Return 1000 shuffled samples: 377 exact zeros plus exponential draws."""
        total, zero_count = 1000, 377
        np.random.seed(42)
        positive_part = np.random.exponential(scale=2, size=total - zero_count)
        samples = np.concatenate([np.zeros(zero_count), positive_part])
        np.random.shuffle(samples)
        return samples

    @pytest.mark.parametrize("frame", ["pd", "pl"])
    def test_performance_manager_detects_zero_inflation(self, frame, zero_inflated_data):
        """Fitting on zero-inflated input swaps the pipeline to the quantile scaler."""
        pm = self._standard_manager(0.15)

        pm.fit(_make_native_df(frame, {"x": zero_inflated_data}))

        # The last transformer must now be the quantile scaler.
        assert pm._using_quantile_scaler is True
        assert isinstance(pm.transformers[-1], QuantilePerformanceScaler)

    @pytest.mark.parametrize("frame", ["pd", "pl"])
    def test_performance_manager_uses_standard_pipeline_for_normal_data(self, frame):
        """Normally distributed input keeps the standard transformer chain."""
        np.random.seed(42)
        # Gaussian samples around 0.5 contain no zero mass.
        gaussian = np.random.normal(loc=0.5, scale=0.1, size=1000)
        native = _make_native_df(frame, {"x": gaussian})

        pm = self._standard_manager(0.15)
        pm.fit(native)

        # The standard chain still ends with the min-max step.
        assert pm._using_quantile_scaler is False
        assert isinstance(pm.transformers[-1], MinMaxTransformer)

    @pytest.mark.parametrize("frame", ["pd", "pl"])
    def test_zero_inflation_output_properties(self, frame, zero_inflated_data):
        """Scaled zero-inflated output is constant on zeros, centred, monotone and bounded."""
        native = _make_native_df(frame, {"x": zero_inflated_data})
        pm = self._standard_manager(0.15)

        transformed = pm.fit_transform(native)
        scaled = nw.from_native(transformed)["performance__perf"].to_numpy()

        zero_mask = np.abs(zero_inflated_data) < 1e-10
        nonzero_mask = ~zero_mask

        # 1. Every zero maps to one shared value.
        zero_scaled_values = scaled[zero_mask]
        assert np.allclose(zero_scaled_values, zero_scaled_values[0], atol=1e-10)

        # 2. Zeros score lower than non-zeros on average.
        assert np.mean(scaled[zero_mask]) < np.mean(scaled[nonzero_mask])

        # 3. Overall mean is close to 0.5.
        assert abs(np.mean(scaled) - 0.5) < 0.02

        # 4. Ordering of the raw values is preserved.
        ascending = np.argsort(zero_inflated_data)
        assert np.all(np.diff(scaled[ascending]) >= -1e-10)

        # 5. Values stay inside the manager's clipping bounds.
        assert np.all((scaled >= pm.min_value) & (scaled <= pm.max_value))

    @pytest.mark.parametrize("frame", ["pd", "pl"])
    def test_disable_zero_inflation_detection(self, frame, zero_inflated_data):
        """A threshold of 0 turns the zero-inflation detection off."""
        pm = self._standard_manager(0)  # 0 disables detection

        pm.fit(_make_native_df(frame, {"x": zero_inflated_data}))

        # No swap to the quantile scaler may have happened.
        assert pm._using_quantile_scaler is False

    @pytest.mark.parametrize("frame", ["pd", "pl"])
    def test_performance_weights_manager_zero_inflation(self, frame, zero_inflated_data):
        """PerformanceWeightsManager applies the same zero-inflation detection."""
        native = _make_native_df(frame, {"feat_a": zero_inflated_data})

        manager = PerformanceWeightsManager(
            weights=[ColumnWeight(name="feat_a", weight=1.0)],
            # None selects the default transformers, which keeps detection enabled.
            transformer_names=None,
            prefix="",
            zero_inflation_threshold=0.15,
        )
        manager.fit(native)

        # The weights manager must also have switched to the quantile scaler.
        assert manager._using_quantile_scaler is True
@@ -1,3 +1,4 @@
1
+ import numpy as np
1
2
  import pandas as pd
2
3
  import polars as pl
3
4
  import pytest
@@ -6,6 +7,7 @@ from sklearn.linear_model import LinearRegression
6
7
 
7
8
  from spforge.performance_transformers import (
8
9
  DiminishingValueTransformer,
10
+ QuantilePerformanceScaler,
9
11
  SymmetricDistributionTransformer,
10
12
  )
11
13
  from spforge.performance_transformers._performances_transformers import (
@@ -355,3 +357,197 @@ def test_symmetric_distribution_transformer_with_granularity_fit_transform():
355
357
  abs(transformed_df.loc[lambda x: x.position == "SG"]["performance"].skew())
356
358
  < transformer.skewness_allowed
357
359
  )
360
+
361
+
362
class TestQuantilePerformanceScaler:
    """Behavioural tests for ``QuantilePerformanceScaler`` on zero-inflated data.

    Throughout, ``pi`` denotes the fitted proportion of zeros that the scaler
    stores per feature in ``_zero_proportion``.
    """

    @staticmethod
    def _fit_transform_column(df, column="performance"):
        """Fit a prefix-less scaler on ``column`` and return ``(scaler, values)``.

        ``values`` is the transformed column as a NumPy array, regardless of
        whether ``df`` is a pandas or a polars frame. Centralising this keeps
        the backend-specific extraction in exactly one place.
        """
        scaler = QuantilePerformanceScaler(features=[column], prefix="")
        transformed = scaler.fit_transform(df)
        if isinstance(transformed, pd.DataFrame):
            return scaler, transformed[column].values
        return scaler, transformed[column].to_numpy()

    @pytest.fixture
    def zero_inflated_data(self):
        """Create 1000 shuffled samples of which 377 (~37.7%) are exact zeros."""
        np.random.seed(42)
        n_total = 1000
        n_zeros = 377
        # Non-zeros come from an exponential distribution; the RNG call order
        # (exponential, then shuffle) is kept so the data stays reproducible.
        nonzeros = np.random.exponential(scale=2, size=n_total - n_zeros)
        raw = np.concatenate([np.zeros(n_zeros), nonzeros])
        np.random.shuffle(raw)
        return raw

    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
    def test_zeros_map_to_midpoint(self, df_type, zero_inflated_data):
        """Test that zeros map to π/2 (midpoint of zero probability mass)."""
        df = df_type({"performance": zero_inflated_data})
        scaler, scaled = self._fit_transform_column(df)

        pi = scaler._zero_proportion["performance"]
        is_zero = np.abs(zero_inflated_data) < 1e-10

        assert np.allclose(scaled[is_zero], pi / 2, atol=1e-10)

    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
    def test_mean_approximately_half(self, df_type, zero_inflated_data):
        """Test that mean ≈ 0.5."""
        df = df_type({"performance": zero_inflated_data})
        _, scaled = self._fit_transform_column(df)

        assert abs(np.mean(scaled) - 0.5) < 0.02

    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
    def test_monotonicity_preserved(self, df_type, zero_inflated_data):
        """Test that monotonicity is preserved (sorted input → sorted output)."""
        df = df_type({"performance": zero_inflated_data})
        _, scaled = self._fit_transform_column(df)

        # Reorder the scaled values by ascending raw value; the result must be
        # non-decreasing, allowing for tiny numerical errors.
        order = np.argsort(zero_inflated_data)
        sorted_scaled = scaled[order]
        assert np.all(np.diff(sorted_scaled) >= -1e-10)

    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
    def test_bounded_zero_one(self, df_type, zero_inflated_data):
        """Test that output is bounded [0, 1]."""
        df = df_type({"performance": zero_inflated_data})
        _, scaled = self._fit_transform_column(df)

        assert np.all((scaled >= 0) & (scaled <= 1))

    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
    def test_nonzeros_span_pi_to_one(self, df_type, zero_inflated_data):
        """Test that non-zeros map to range (π, 1)."""
        df = df_type({"performance": zero_inflated_data})
        scaler, scaled = self._fit_transform_column(df)

        pi = scaler._zero_proportion["performance"]
        is_nonzero = np.abs(zero_inflated_data) >= 1e-10

        # Lower bound π and upper bound 1, each with a small float tolerance.
        assert np.all(scaled[is_nonzero] >= pi - 1e-10)
        assert np.all(scaled[is_nonzero] <= 1 + 1e-10)

    def test_with_prefix(self):
        """Test that prefix is applied correctly."""
        np.random.seed(42)
        raw = np.concatenate([np.zeros(50), np.random.exponential(2, 50)])
        df = pd.DataFrame({"feat": raw})

        scaler = QuantilePerformanceScaler(features=["feat"], prefix="scaled_")
        transformed = scaler.fit_transform(df)

        assert "scaled_feat" in transformed.columns
        assert scaler.features_out == ["scaled_feat"]

    def test_multiple_features(self):
        """Test that multiple features are handled correctly."""
        np.random.seed(42)
        raw_a = np.concatenate([np.zeros(50), np.random.exponential(2, 50)])
        raw_b = np.concatenate([np.zeros(30), np.random.exponential(3, 70)])
        df = pd.DataFrame({"a": raw_a, "b": raw_b})

        scaler = QuantilePerformanceScaler(features=["a", "b"], prefix="")
        transformed = scaler.fit_transform(df)

        assert "a" in transformed.columns
        assert "b" in transformed.columns

        # Both should have mean ≈ 0.5
        assert abs(transformed["a"].mean() - 0.5) < 0.05
        assert abs(transformed["b"].mean() - 0.5) < 0.05

    def test_all_zeros(self):
        """Test edge case: all values are zero (π=1)."""
        df = pd.DataFrame({"x": [0.0, 0.0, 0.0, 0.0, 0.0]})
        scaler, scaled = self._fit_transform_column(df, column="x")

        # π=1, so all values should map to π/2 = 0.5
        assert np.allclose(scaled, 0.5)
        assert scaler._zero_proportion["x"] == 1.0

    def test_no_zeros(self):
        """Test edge case: no zeros (π=0)."""
        np.random.seed(42)
        # Shifting by 0.1 guarantees every sample is strictly positive.
        df = pd.DataFrame({"x": np.random.exponential(2, 100) + 0.1})
        scaler, scaled = self._fit_transform_column(df, column="x")

        # π=0, so values should span (0, 1) via quantiles
        assert scaler._zero_proportion["x"] == 0.0
        assert scaled.min() >= 0
        assert scaled.max() <= 1
        # Mean should still be ~0.5
        assert abs(scaled.mean() - 0.5) < 0.05

    def test_nan_handling(self):
        """Test that NaN values are preserved in output."""
        df = pd.DataFrame({"x": [0.0, 1.0, np.nan, 2.0, 0.0, np.nan, 3.0]})
        _, scaled = self._fit_transform_column(df, column="x")

        # NaN positions should remain NaN
        assert np.isnan(scaled[2])
        assert np.isnan(scaled[5])

        # Non-NaN values should be valid
        finite = scaled[~np.isnan(scaled)]
        assert np.all((finite >= 0) & (finite <= 1))

    def test_single_unique_nonzero(self):
        """Test edge case: single unique non-zero value."""
        df = pd.DataFrame({"x": [0.0, 0.0, 5.0, 5.0, 0.0, 5.0]})
        scaler, scaled = self._fit_transform_column(df, column="x")

        pi = scaler._zero_proportion["x"]
        is_zero = (df["x"] == 0).values

        # Zeros should map to π/2
        assert np.allclose(scaled[is_zero], pi / 2)

        # Non-zeros should all map to same value (since they're all equal)
        nonzero_values = scaled[~is_zero]
        assert np.allclose(nonzero_values, nonzero_values[0])
@@ -1951,9 +1951,11 @@ def test_fit_transform__player_rating_difference_from_team_projected_feature(bas
1951
1951
  assert player_col in result.columns
1952
1952
  assert team_col in result.columns
1953
1953
 
1954
- for row in result.iter_rows(named=True):
1955
- expected = row[player_col] - row[team_col]
1956
- assert row[diff_col] == pytest.approx(expected, rel=1e-9)
1954
+ # Verify diff = player - team (vectorized)
1955
+ max_diff = result.select(
1956
+ (pl.col(diff_col) - (pl.col(player_col) - pl.col(team_col))).abs().max()
1957
+ ).item()
1958
+ assert max_diff < 1e-9
1957
1959
 
1958
1960
 
1959
1961
  def test_fit_transform__start_league_quantile_uses_existing_player_ratings(base_cn):
@@ -2909,3 +2911,168 @@ def test_fit_transform_null_perf_with_use_off_def_split_false__no_crash(base_cn)
2909
2911
  # TypeError: float() argument must be a string or a number, not 'NoneType'
2910
2912
  result = gen.fit_transform(df)
2911
2913
  assert result is not None
2914
+
2915
+
2916
def test_player_opponent_mean_projected_feature(base_cn, sample_df):
    """PLAYER_OPPONENT_MEAN_PROJECTED must be the mean of player and projected opponent ratings."""
    requested_features = [
        RatingKnownFeatures.PLAYER_RATING,
        RatingKnownFeatures.OPPONENT_RATING_PROJECTED,
        RatingKnownFeatures.PLAYER_OPPONENT_MEAN_PROJECTED,
    ]
    gen = PlayerRatingGenerator(
        performance_column="perf",
        column_names=base_cn,
        auto_scale_performance=True,
        features_out=requested_features,
    )
    result = gen.fit_transform(sample_df)

    # The feature column has to be emitted at all.
    mean_col = "player_opponent_mean_projected_perf"
    assert mean_col in result.columns

    # Compare against (player + opponent) / 2 in one vectorized pass and
    # look only at the worst-case absolute deviation.
    expected_mean = (
        pl.col("player_rating_perf") + pl.col("opponent_rating_projected_perf")
    ) / 2
    abs_error = (pl.col(mean_col) - expected_mean).abs()
    diff = result.select(abs_error.max()).item()
    assert diff < 1e-6, f"Max difference from expected mean: {diff}"
2941
+
2942
+
2943
class TestNaNPerformanceHandling:
    """Tests that PlayerRatingGenerator handles NaN performance values correctly."""

    @pytest.fixture
    def nan_cn(self):
        """Column-name mapping used by every test in this class."""
        return ColumnNames(
            player_id="player_id",
            team_id="team_id",
            match_id="match_id",
            start_date="start_date",
            participation_weight="participation_weight",
        )

    @staticmethod
    def _make_generator(column_names):
        """Build the generator configuration shared by all tests in this class."""
        return PlayerRatingGenerator(
            performance_column="performance",
            column_names=column_names,
            features_out=[RatingKnownFeatures.PLAYER_OFF_RATING],
        )

    def _create_test_df(self, performance_values: list) -> pl.DataFrame:
        """Create minimal test DataFrame with 2 teams, 2 players each.

        ``performance_values`` must hold four entries, one per player
        (p1..p4), all belonging to the single match ``game1``.
        """
        return pl.DataFrame({
            "match_id": ["game1"] * 4,
            "player_id": ["p1", "p2", "p3", "p4"],
            "team_id": ["A", "A", "B", "B"],
            "start_date": ["2024-01-01"] * 4,
            "performance": performance_values,
            "participation_weight": [1.0] * 4,
        })

    def test_nan_performance_does_not_raise(self, nan_cn):
        """NaN performance values should not raise ValueError."""
        # Use values that give mean ~0.5 when NaN is excluded
        df = self._create_test_df([0.6, float("nan"), 0.4, 0.5])

        # Should not raise
        result = self._make_generator(nan_cn).fit_transform(df)
        assert len(result) == 4

    def test_inf_performance_does_not_raise(self, nan_cn):
        """Inf performance values should not raise ValueError."""
        # Use values that give mean ~0.5 when inf is excluded
        df = self._create_test_df([0.6, float("inf"), 0.4, 0.5])

        result = self._make_generator(nan_cn).fit_transform(df)
        assert len(result) == 4

    def test_neg_inf_performance_does_not_raise(self, nan_cn):
        """Negative inf performance values should not raise ValueError."""
        # Use values that give mean ~0.5 when -inf is excluded
        df = self._create_test_df([0.6, float("-inf"), 0.4, 0.5])

        result = self._make_generator(nan_cn).fit_transform(df)
        assert len(result) == 4

    def test_nan_performance_treated_as_zero_rating_change(self, nan_cn):
        """Players with NaN performance should have zero rating change."""
        # Two games: first establishes ratings, second tests NaN handling
        df = pl.DataFrame({
            "match_id": ["game1"] * 4 + ["game2"] * 4,
            "player_id": ["p1", "p2", "p3", "p4"] * 2,
            "team_id": ["A", "A", "B", "B"] * 2,
            "start_date": ["2024-01-01"] * 4 + ["2024-01-02"] * 4,
            "performance": [0.5, 0.5, 0.5, 0.5, 0.6, float("nan"), 0.4, 0.5],
            "participation_weight": [1.0] * 8,
        })

        result = self._make_generator(nan_cn).fit_transform(df)

        def p2_rating(match_id):
            """Return player p2's offensive rating on the row for ``match_id``."""
            row = result.filter(
                (pl.col("player_id") == "p2") & (pl.col("match_id") == match_id)
            )
            return row["player_off_rating_performance"][0]

        p2_game1 = p2_rating("game1")
        p2_game2 = p2_rating("game2")

        # NOTE(review): if the emitted rating is the pre-match value, the game2
        # row only reflects game1's update and the NaN row's own effect is never
        # observed - confirm this comparison exercises the NaN path.
        # Rating should not change when performance is NaN
        assert p2_game1 == p2_game2, "NaN performance should result in zero rating change"

    def test_all_nan_performance_in_match_handled(self, nan_cn):
        """Match where all players have NaN should not raise."""
        # All NaN - validation is skipped when no finite values exist
        df = self._create_test_df([float("nan")] * 4)

        result = self._make_generator(nan_cn).fit_transform(df)
        assert len(result) == 4

    def test_mixed_nan_none_performance(self, nan_cn):
        """Mix of NaN and None performance values should both be handled."""
        # Use values that give mean ~0.5 when NaN/None are excluded
        df = self._create_test_df([0.6, float("nan"), None, 0.5])

        result = self._make_generator(nan_cn).fit_transform(df)
        assert len(result) == 4