spforge 0.8.37__py3-none-any.whl → 0.8.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spforge/performance_transformers/_performance_manager.py +5 -0
- spforge/performance_transformers/_performances_transformers.py +72 -6
- {spforge-0.8.37.dist-info → spforge-0.8.38.dist-info}/METADATA +1 -1
- {spforge-0.8.37.dist-info → spforge-0.8.38.dist-info}/RECORD +9 -9
- tests/performance_transformers/test_performance_manager.py +140 -0
- tests/performance_transformers/test_performances_transformers.py +211 -0
- {spforge-0.8.37.dist-info → spforge-0.8.38.dist-info}/WHEEL +0 -0
- {spforge-0.8.37.dist-info → spforge-0.8.38.dist-info}/licenses/LICENSE +0 -0
- {spforge-0.8.37.dist-info → spforge-0.8.38.dist-info}/top_level.txt +0 -0

spforge/performance_transformers/_performance_manager.py

@@ -89,6 +89,7 @@ class PerformanceManager(BaseEstimator, TransformerMixin):
         min_value: float = 0.0,
         max_value: float = 1.0,
         zero_inflation_threshold: float = 0.15,
+        quantile_weight_column: str | None = None,
     ):
         self.features = features
         self.prefix = prefix

@@ -106,6 +107,7 @@ class PerformanceManager(BaseEstimator, TransformerMixin):
         self.min_value = min_value
         self.max_value = max_value
         self.zero_inflation_threshold = zero_inflation_threshold
+        self.quantile_weight_column = quantile_weight_column
 
         self.transformers = create_performance_scalers_transformers(
             transformer_names=self.transformer_names,

@@ -150,6 +152,7 @@ class PerformanceManager(BaseEstimator, TransformerMixin):
                 QuantilePerformanceScaler(
                     features=prefixed_features,
                     prefix="",
+                    weight_column=self.quantile_weight_column,
                 )
             ]
             break

@@ -214,6 +217,7 @@ class PerformanceWeightsManager(PerformanceManager):
         prefix: str = "performance__",
         return_all_features: bool = False,
         zero_inflation_threshold: float = 0.15,
+        quantile_weight_column: str | None = None,
     ):
         self.weights = weights
         self.return_all_features = return_all_features

@@ -227,6 +231,7 @@ class PerformanceWeightsManager(PerformanceManager):
             min_value=min_value,
             performance_column=performance_column,
             zero_inflation_threshold=zero_inflation_threshold,
+            quantile_weight_column=quantile_weight_column,
         )
 
     @nw.narwhalify
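
For context, a minimal usage sketch of the new `quantile_weight_column` parameter wired through above. The import path and the `points`/`minutes` column names are illustrative assumptions, not taken from this diff; the constructor arguments follow the signatures shown here and in the new tests.

```python
# Hedged usage sketch (not from the package source) of quantile_weight_column.
import numpy as np
import pandas as pd

from spforge.performance_transformers import PerformanceManager  # assumed export path

rng = np.random.default_rng(0)
minutes = rng.exponential(scale=20, size=500) + 1                    # participation weight
points = np.where(rng.random(500) < 0.4, 0.0,
                  rng.exponential(scale=2, size=500))                # zero-inflated stat

df = pd.DataFrame({"points": points, "minutes": minutes})

manager = PerformanceManager(
    features=["points"],
    performance_column="perf",
    zero_inflation_threshold=0.15,       # zero-inflation detection, as before
    quantile_weight_column="minutes",    # new in 0.8.38: weight the quantile fit
)
scaled = manager.fit_transform(df)
```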

spforge/performance_transformers/_performances_transformers.py

@@ -432,6 +432,9 @@ class QuantilePerformanceScaler(BaseEstimator, TransformerMixin):
     - Non-zeros → uniform on (π, 1) via empirical CDF
 
     Fast: O(n log n) for fit, O(n) for transform.
+
+    If weight_column is provided, weighted quantiles are computed so that
+    the scaling respects participation weights (e.g., minutes played).
     """
 
     def __init__(
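
The docstring addition above describes the behaviour; here is a direct-use sketch of `QuantilePerformanceScaler` with the new `weight_column`, mirroring the tests added further down in this diff. The import path and the `performance`/`weight` column names are illustrative assumptions.

```python
# Hedged sketch: weight-aware quantile scaling of a zero-inflated column.
import numpy as np
import pandas as pd

from spforge.performance_transformers import QuantilePerformanceScaler  # assumed export path

rng = np.random.default_rng(42)
weights = rng.exponential(scale=20, size=1_000) + 1                     # e.g., minutes played
values = np.where(rng.random(1_000) < 0.4, 0.0,
                  rng.exponential(scale=2, size=1_000))                 # zero-inflated stat

df = pd.DataFrame({"performance": values, "weight": weights})

scaler = QuantilePerformanceScaler(features=["performance"], prefix="", weight_column="weight")
scaled = scaler.fit_transform(df)["performance"].to_numpy()

# With weighting, the weighted mean of the scaled values lands near 0.5,
# which is what the new tests assert.
print(np.average(scaled, weights=weights))
```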

@@ -440,11 +443,13 @@ class QuantilePerformanceScaler(BaseEstimator, TransformerMixin):
         zero_threshold: float = 1e-10,
         n_quantiles: int = 1000,
         prefix: str = "",
+        weight_column: str | None = None,
     ):
         self.features = features
         self.zero_threshold = zero_threshold
         self.n_quantiles = n_quantiles
         self.prefix = prefix
+        self.weight_column = weight_column
         self.features_out = [self.prefix + f for f in self.features]
 
         self._zero_proportion: dict[str, float] = {}

@@ -452,21 +457,82 @@ class QuantilePerformanceScaler(BaseEstimator, TransformerMixin):
 
     @nw.narwhalify
     def fit(self, df: IntoFrameT, y=None):
+        # Get weights if specified
+        weights = None
+        if self.weight_column is not None:
+            weights = df[self.weight_column].to_numpy()
+
         for feature in self.features:
             values = df[feature].to_numpy()
-            values = values[np.isfinite(values)]
 
-
-
+            # Create finite mask
+            finite_mask = np.isfinite(values)
+            if weights is not None:
+                # Also require finite, positive weights
+                weight_valid = np.isfinite(weights) & (weights > 0)
+                finite_mask = finite_mask & weight_valid
+
+            values_finite = values[finite_mask]
+
+            if weights is not None:
+                weights_finite = weights[finite_mask]
+            else:
+                weights_finite = None
+
+            is_zero = np.abs(values_finite) < self.zero_threshold
+
+            if weights_finite is not None:
+                # Weighted zero proportion: sum(weights where zero) / sum(weights)
+                total_weight = np.sum(weights_finite)
+                if total_weight > 0:
+                    self._zero_proportion[feature] = np.sum(weights_finite[is_zero]) / total_weight
+                else:
+                    self._zero_proportion[feature] = np.mean(is_zero)
+            else:
+                self._zero_proportion[feature] = np.mean(is_zero)
+
+            nonzero_mask = ~is_zero
+            nonzero_values = values_finite[nonzero_mask]
 
-            nonzero_values = values[~is_zero]
             if len(nonzero_values) > 0:
-
-
+                if weights_finite is not None:
+                    # Weighted quantiles using interpolation on weighted CDF
+                    nonzero_weights = weights_finite[nonzero_mask]
+                    self._nonzero_quantiles[feature] = self._compute_weighted_quantiles(
+                        nonzero_values, nonzero_weights
+                    )
+                else:
+                    percentiles = np.linspace(0, 100, self.n_quantiles + 1)
+                    self._nonzero_quantiles[feature] = np.percentile(nonzero_values, percentiles)
             else:
                 self._nonzero_quantiles[feature] = None
         return self
 
+    def _compute_weighted_quantiles(
+        self, values: np.ndarray, weights: np.ndarray
+    ) -> np.ndarray:
+        """Compute weighted quantiles using weighted CDF interpolation."""
+        # Sort by value
+        order = np.argsort(values)
+        sorted_values = values[order]
+        sorted_weights = weights[order]
+
+        # Compute weighted CDF
+        cumulative_weights = np.cumsum(sorted_weights)
+        total_weight = cumulative_weights[-1]
+
+        # Normalize CDF to [0, 1]
+        cdf = cumulative_weights / total_weight
+
+        # Sample quantiles at evenly spaced CDF positions
+        target_cdf = np.linspace(0, 1, self.n_quantiles + 1)
+
+        # Interpolate to get quantile values
+        # Use np.interp which handles edge cases gracefully
+        quantiles = np.interp(target_cdf, cdf, sorted_values)
+
+        return quantiles
+
     @nw.narwhalify
     def transform(self, df: IntoFrameT) -> IntoFrameT:
         for feature in self.features:
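
The heart of the change is `_compute_weighted_quantiles`. The following standalone NumPy sketch walks through the same weighted-CDF interpolation outside the class, showing that uniform weights roughly recover the ordinary quantile grid and that up-weighting rows shifts it. The helper name `weighted_quantiles` is illustrative and not part of spforge.

```python
# Standalone illustration of weighted quantiles via weighted-CDF interpolation,
# mirroring the steps of the _compute_weighted_quantiles method added above.
import numpy as np


def weighted_quantiles(values, weights, n_quantiles=1000):
    order = np.argsort(values)                                 # sort values, carry weights along
    sorted_values = values[order]
    sorted_weights = weights[order]
    cdf = np.cumsum(sorted_weights) / np.sum(sorted_weights)   # weighted CDF, ends at 1.0
    target_cdf = np.linspace(0, 1, n_quantiles + 1)            # evenly spaced probabilities
    return np.interp(target_cdf, cdf, sorted_values)           # interpolate quantile values


rng = np.random.default_rng(0)
x = rng.exponential(scale=2.0, size=5_000)

# Uniform weights behave like the unweighted empirical quantiles.
uniform = weighted_quantiles(x, np.ones_like(x))
classic = np.percentile(x, np.linspace(0, 100, 1001))
print(uniform[500], classic[500])  # the two medians agree closely

# Up-weighting the upper half shifts the quantile grid upward,
# so the weighted median exceeds the unweighted one.
w_biased = np.where(x > np.median(x), 3.0, 1.0)
print(weighted_quantiles(x, w_biased)[500])
```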

{spforge-0.8.37.dist-info → spforge-0.8.38.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: spforge
-Version: 0.8.37
+Version: 0.8.38
 Summary: A flexible framework for generating features, ratings, and building machine learning or other models for training and inference on sports data.
 Author-email: Mathias Holmstrøm <mathiasholmstom@gmail.com>
 License: See LICENSE file

{spforge-0.8.37.dist-info → spforge-0.8.38.dist-info}/RECORD

@@ -47,8 +47,8 @@ spforge/hyperparameter_tuning/__init__.py,sha256=Vcl8rVlJ7M708iPgqe4XxpZWgJKGux0
 spforge/hyperparameter_tuning/_default_search_spaces.py,sha256=SjwXLpvYIu_JY8uPRHeL5Kgp1aa0slWDz8qsKDaohWQ,8020
 spforge/hyperparameter_tuning/_tuner.py,sha256=M79q3saM6r0UZJsRUUgfdDr-3Qii-F2-wuSAZLFtZDo,19246
 spforge/performance_transformers/__init__.py,sha256=J-5olqi1M_BUj3sN1NqAz9s28XAbuKK9M9xHq7IGlQU,482
-spforge/performance_transformers/_performance_manager.py,sha256=
-spforge/performance_transformers/_performances_transformers.py,sha256=
+spforge/performance_transformers/_performance_manager.py,sha256=lh7enqYLd1lXj1VTOiK5N880xkil5q1jRsM51fe_K5g,12322
+spforge/performance_transformers/_performances_transformers.py,sha256=nmjJTEH86JjFneWsnSWIYnUXQoUDskOraDO3VtuufIY,20931
 spforge/ratings/__init__.py,sha256=OZVH2Lo6END3n1X8qi4QcyAPlThIwAYwVKCiIuOQSQU,576
 spforge/ratings/_base.py,sha256=Z-VVXWmnzR0O7o2_Q2x2ru_3uiTMpWqKDGQaNBJxCMA,14927
 spforge/ratings/_player_rating.py,sha256=AIpDEl6cZaC3urcY-jFFgUWd4WZ71A33c5mOPfkXdMs,68178

@@ -71,7 +71,7 @@ spforge/transformers/_other_transformer.py,sha256=w2a7Wnki3vJe4GAkSa4kealw0GILIo
 spforge/transformers/_predictor.py,sha256=2sE6gfVrilXzPVcBurSrtqHw33v2ljygQcEYXt9LhZc,3119
 spforge/transformers/_simple_transformer.py,sha256=zGUFNQYMeoDSa2CoQejQNiNmKCBN5amWTvyOchiUHj0,5660
 spforge/transformers/_team_ratio_predictor.py,sha256=g8_bR53Yyv0iNCtol1O9bgJSeZcIco_AfbQuUxQJkeY,6884
-spforge-0.8.
+spforge-0.8.38.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
 tests/test_autopipeline.py,sha256=7cNAn-nmGolfyfk3THh9IKcHZfRA-pLYC_xAyMg-No4,26863
 tests/test_autopipeline_context.py,sha256=IuRUY4IA6uMObvbl2pXSaXO2_tl3qX6wEbTZY0dkTMI,1240
 tests/test_feature_generator_pipeline.py,sha256=CK0zVL8PfTncy3RmG9i-YpgwjOIV7yJhV7Q44tbetI8,19020

@@ -92,8 +92,8 @@ tests/feature_generator/test_rolling_mean_days.py,sha256=EyOvdJDnmgPfe13uQBOkwo7
 tests/feature_generator/test_rolling_window.py,sha256=_o9oljcAIZ14iI7e8WFeAsfXxILnyqBffit21HOvII4,24378
 tests/hyperparameter_tuning/test_estimator_tuner.py,sha256=iewME41d6LR2aQ0OtohGFtN_ocJUwTeqvs6L0QDmfG4,4413
 tests/hyperparameter_tuning/test_rating_tuner.py,sha256=usjC2ioO_yWRjjNAlRTyMVYheOrCi0kKocmHQHdTmpM,18699
-tests/performance_transformers/test_performance_manager.py,sha256=
-tests/performance_transformers/test_performances_transformers.py,sha256=
+tests/performance_transformers/test_performance_manager.py,sha256=tHyyyjAotJLtrViWx3j0DaNDqp3nowooMu5Wop7DjBE,24766
+tests/performance_transformers/test_performances_transformers.py,sha256=QyLNzis7yOBsjjclhuYrbZFaSmlTcSAbGVzNvK1B-SU,27817
 tests/ratings/test_player_rating_generator.py,sha256=1Pkx0H8xJMTeLc2Fu9zJcoDpBWiY2zCVSxuBFJk2uEs,110717
 tests/ratings/test_player_rating_no_mutation.py,sha256=GzO3Hl__5K68DS3uRLefwnbcTJOvBM7cZqww4M21UZM,8493
 tests/ratings/test_ratings_property.py,sha256=ckyfGILXa4tfQvsgyXEzBDNr2DUmHwFRV13N60w66iE,6561

@@ -108,7 +108,7 @@ tests/transformers/test_other_transformer.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 tests/transformers/test_predictor_transformer.py,sha256=N1aBYLjN3ldpYZLwjih_gTFYSMitrZu-PNK78W6RHaQ,6877
 tests/transformers/test_simple_transformer.py,sha256=wWR0qjLb_uS4HXrJgGdiqugOY1X7kwd1_OPS02IT2b8,4676
 tests/transformers/test_team_ratio_predictor.py,sha256=fOUP_JvNJi-3kom3ZOs1EdG0I6Z8hpLpYKNHu1eWtOw,8562
-spforge-0.8.
-spforge-0.8.
-spforge-0.8.
-spforge-0.8.
+spforge-0.8.38.dist-info/METADATA,sha256=XXk1_WwD1gWvzRk08OSagsR6_w0qJAjcWX57-fwL9rg,20048
+spforge-0.8.38.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+spforge-0.8.38.dist-info/top_level.txt,sha256=6UW2M5a7WKOeaAi900qQmRKNj5-HZzE8-eUD9Y9LTq0,23
+spforge-0.8.38.dist-info/RECORD,,

tests/performance_transformers/test_performance_manager.py

@@ -436,6 +436,146 @@ class TestZeroInflationHandling:
         assert manager._using_quantile_scaler is True
 
 
+class TestWeightedQuantileScaling:
+    """Tests for weighted quantile scaling in PerformanceManager."""
+
+    @pytest.fixture
+    def weighted_zero_inflated_data(self):
+        """Create zero-inflated data where high-weight rows have higher non-zero rate."""
+        np.random.seed(42)
+        n = 1000
+
+        # Create weights (e.g., minutes played)
+        weights = np.random.exponential(scale=20, size=n) + 1
+
+        # High-weight rows have lower zero probability
+        values = []
+        for w in weights:
+            zero_prob = 0.6 - 0.4 * (w / weights.max())
+            if np.random.random() < zero_prob:
+                values.append(0.0)
+            else:
+                values.append(np.random.exponential(scale=2))
+
+        return np.array(values), weights
+
+    @pytest.mark.parametrize("frame", ["pd", "pl"])
+    def test_performance_manager_with_weight_column(self, frame, weighted_zero_inflated_data):
+        """Test that PerformanceManager passes weight column to QuantilePerformanceScaler."""
+        values, weights = weighted_zero_inflated_data
+        df = _make_native_df(frame, {"x": values, "minutes": weights})
+
+        pm = PerformanceManager(
+            features=["x"],
+            transformer_names=None,  # Use defaults, auto-detect zero inflation
+            prefix="performance__",
+            performance_column="perf",
+            zero_inflation_threshold=0.15,
+            quantile_weight_column="minutes",
+        )
+
+        pm.fit(df)
+
+        # Should have switched to quantile scaler
+        assert pm._using_quantile_scaler is True
+        assert isinstance(pm.transformers[-1], QuantilePerformanceScaler)
+        # And should have the weight column set
+        assert pm.transformers[-1].weight_column == "minutes"
+
+    @pytest.mark.parametrize("frame", ["pd", "pl"])
+    def test_weighted_scaling_reduces_weighted_bias(self, frame, weighted_zero_inflated_data):
+        """Test that weighted scaling produces weighted mean closer to 0.5."""
+        values, weights = weighted_zero_inflated_data
+        df = _make_native_df(frame, {"x": values, "minutes": weights})
+
+        # With weighted scaling
+        pm_weighted = PerformanceManager(
+            features=["x"],
+            transformer_names=None,
+            prefix="performance__",
+            performance_column="perf",
+            zero_inflation_threshold=0.15,
+            quantile_weight_column="minutes",
+        )
+
+        result_weighted = pm_weighted.fit_transform(df)
+        result_weighted_nw = nw.from_native(result_weighted)
+        scaled_weighted = result_weighted_nw["performance__perf"].to_numpy()
+
+        # Without weighted scaling
+        pm_unweighted = PerformanceManager(
+            features=["x"],
+            transformer_names=None,
+            prefix="performance__",
+            performance_column="perf",
+            zero_inflation_threshold=0.15,
+            quantile_weight_column=None,  # No weighting
+        )
+
+        result_unweighted = pm_unweighted.fit_transform(df)
+        result_unweighted_nw = nw.from_native(result_unweighted)
+        scaled_unweighted = result_unweighted_nw["performance__perf"].to_numpy()
+
+        # Compute weighted means
+        weighted_mean_of_weighted = np.average(scaled_weighted, weights=weights)
+        weighted_mean_of_unweighted = np.average(scaled_unweighted, weights=weights)
+
+        # Weighted scaling should have weighted mean closer to 0.5
+        assert abs(weighted_mean_of_weighted - 0.5) < abs(weighted_mean_of_unweighted - 0.5), (
+            f"Weighted mean with weighted scaling ({weighted_mean_of_weighted:.4f}) "
+            f"should be closer to 0.5 than without ({weighted_mean_of_unweighted:.4f})"
+        )
+
+    @pytest.mark.parametrize("frame", ["pd", "pl"])
+    def test_performance_weights_manager_with_quantile_weight_column(
+        self, frame, weighted_zero_inflated_data
+    ):
+        """Test that PerformanceWeightsManager also supports quantile_weight_column."""
+        from spforge.performance_transformers._performance_manager import ColumnWeight
+
+        values, weights = weighted_zero_inflated_data
+        df = _make_native_df(frame, {"feat_a": values, "minutes": weights})
+
+        column_weights = [ColumnWeight(name="feat_a", weight=1.0)]
+        manager = PerformanceWeightsManager(
+            weights=column_weights,
+            transformer_names=None,
+            prefix="",
+            zero_inflation_threshold=0.15,
+            quantile_weight_column="minutes",
+        )
+
+        manager.fit(df)
+
+        # Should have switched to quantile scaler with weight column
+        assert manager._using_quantile_scaler is True
+        assert manager.transformers[-1].weight_column == "minutes"
+
+    @pytest.mark.parametrize("frame", ["pd", "pl"])
+    def test_weight_column_not_used_when_no_zero_inflation(self, frame):
+        """Test that weight column is not needed when zero inflation is not detected."""
+        np.random.seed(42)
+        # Normal distribution - no zero inflation
+        data = np.random.normal(loc=0.5, scale=0.1, size=1000)
+        weights = np.random.exponential(scale=20, size=1000) + 1
+
+        df = _make_native_df(frame, {"x": data, "minutes": weights})
+
+        pm = PerformanceManager(
+            features=["x"],
+            transformer_names=None,
+            prefix="performance__",
+            performance_column="perf",
+            zero_inflation_threshold=0.15,
+            quantile_weight_column="minutes",
+        )
+
+        pm.fit(df)
+
+        # Should NOT have switched to quantile scaler
+        assert pm._using_quantile_scaler is False
+
+
 class TestAutoScalePerformanceBounds:
     """Tests for ensuring scaled performance stays within [0, 1] bounds."""
 

tests/performance_transformers/test_performances_transformers.py

@@ -551,3 +551,214 @@ class TestQuantilePerformanceScaler:
         # Non-zeros should all map to same value (since they're all equal)
         nonzero_values = transformed["x"].values[~is_zero.values]
         assert np.allclose(nonzero_values, nonzero_values[0])
+
+
+class TestWeightedQuantilePerformanceScaler:
+    """Tests for weighted quantile scaling."""
+
+    @pytest.fixture
+    def weighted_zero_inflated_data(self):
+        """Create zero-inflated data where high-weight rows have higher non-zero rate."""
+        np.random.seed(42)
+        n = 1000
+
+        # Create weights (e.g., minutes played)
+        weights = np.random.exponential(scale=20, size=n) + 1  # 1 to ~100
+
+        # High-weight rows have lower zero probability
+        # This simulates: players with more minutes are more likely to have non-zero stats
+        values = []
+        for w in weights:
+            # Zero probability decreases as weight increases
+            zero_prob = 0.6 - 0.4 * (w / weights.max())  # 0.2 to 0.6
+            if np.random.random() < zero_prob:
+                values.append(0.0)
+            else:
+                values.append(np.random.exponential(scale=2))
+
+        return np.array(values), weights
+
+    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
+    def test_weighted_mean_alignment(self, df_type, weighted_zero_inflated_data):
+        """Test that weighted mean of scaled values is closer to 0.5 with weighted scaling."""
+        values, weights = weighted_zero_inflated_data
+        df = df_type({"performance": values, "weight": weights})
+
+        # Weighted scaler
+        weighted_scaler = QuantilePerformanceScaler(
+            features=["performance"], prefix="", weight_column="weight"
+        )
+        weighted_transformed = weighted_scaler.fit_transform(df)
+
+        if isinstance(weighted_transformed, pd.DataFrame):
+            weighted_scaled = weighted_transformed["performance"].values
+        else:
+            weighted_scaled = weighted_transformed["performance"].to_numpy()
+
+        # Compute weighted mean
+        weighted_mean = np.average(weighted_scaled, weights=weights)
+
+        # Weighted scaling should have weighted mean close to 0.5
+        assert abs(weighted_mean - 0.5) < 0.02, (
+            f"Weighted mean should be close to 0.5, got {weighted_mean}"
+        )
+
+    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
+    def test_backward_compatibility_without_weights(self, df_type):
+        """Test that weight_column=None matches original unweighted behavior."""
+        np.random.seed(42)
+        n = 500
+        zeros = np.zeros(200)
+        nonzeros = np.random.exponential(scale=2, size=n - 200)
+        raw = np.concatenate([zeros, nonzeros])
+        np.random.shuffle(raw)
+
+        df = df_type({"performance": raw})
+
+        # Unweighted scaler (explicitly None)
+        unweighted_scaler = QuantilePerformanceScaler(
+            features=["performance"], prefix="", weight_column=None
+        )
+        unweighted_result = unweighted_scaler.fit_transform(df)
+
+        # Scaler without weight_column argument
+        default_scaler = QuantilePerformanceScaler(features=["performance"], prefix="")
+        default_result = default_scaler.fit_transform(df)
+
+        if isinstance(unweighted_result, pd.DataFrame):
+            unweighted_values = unweighted_result["performance"].values
+            default_values = default_result["performance"].values
+        else:
+            unweighted_values = unweighted_result["performance"].to_numpy()
+            default_values = default_result["performance"].to_numpy()
+
+        # Results should be identical
+        assert np.allclose(unweighted_values, default_values, atol=1e-10)
+
+    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
+    def test_equal_weights_matches_unweighted(self, df_type):
+        """Test that all equal weights produce same result as unweighted."""
+        np.random.seed(42)
+        n = 500
+        zeros = np.zeros(200)
+        nonzeros = np.random.exponential(scale=2, size=n - 200)
+        raw = np.concatenate([zeros, nonzeros])
+        np.random.shuffle(raw)
+        equal_weights = np.ones(n)
+
+        df = df_type({"performance": raw, "weight": equal_weights})
+
+        # Weighted scaler with equal weights
+        weighted_scaler = QuantilePerformanceScaler(
+            features=["performance"], prefix="", weight_column="weight"
+        )
+        weighted_result = weighted_scaler.fit_transform(df)
+
+        # Unweighted scaler
+        unweighted_scaler = QuantilePerformanceScaler(
+            features=["performance"], prefix="", weight_column=None
+        )
+        unweighted_result = unweighted_scaler.fit_transform(df)
+
+        if isinstance(weighted_result, pd.DataFrame):
+            weighted_values = weighted_result["performance"].values
+            unweighted_values = unweighted_result["performance"].values
+        else:
+            weighted_values = weighted_result["performance"].to_numpy()
+            unweighted_values = unweighted_result["performance"].to_numpy()
+
+        # Results should be very close (may differ slightly due to algorithm differences)
+        assert np.allclose(weighted_values, unweighted_values, atol=0.02)
+
+    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
+    def test_weights_with_zeros_excluded(self, df_type):
+        """Test that rows with zero weights are excluded from fitting."""
+        np.random.seed(42)
+        # Create data where zeros have zero weight
+        values = np.array([0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0])
+        weights = np.array([0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0])  # Zero weight for zeros
+
+        df = df_type({"performance": values, "weight": weights})
+
+        scaler = QuantilePerformanceScaler(
+            features=["performance"], prefix="", weight_column="weight"
+        )
+        scaler.fit(df)
+
+        # Zero proportion should be 0 because zero-weight rows are excluded
+        assert scaler._zero_proportion["performance"] == 0.0
+
+    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
+    def test_weighted_zero_proportion(self, df_type):
+        """Test that zero proportion is computed using weights."""
+        # 3 zeros with weight 10 each = 30
+        # 7 non-zeros with weight 10 each = 70
+        # Weighted zero proportion = 30/100 = 0.3
+        values = np.array([0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0])
+        weights = np.array([10.0] * 10)
+
+        df = df_type({"performance": values, "weight": weights})
+
+        scaler = QuantilePerformanceScaler(
+            features=["performance"], prefix="", weight_column="weight"
+        )
+        scaler.fit(df)
+
+        assert abs(scaler._zero_proportion["performance"] - 0.3) < 1e-10
+
+    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
+    def test_weighted_zero_proportion_unequal_weights(self, df_type):
+        """Test weighted zero proportion with unequal weights."""
+        # 2 zeros with weight 5 each = 10
+        # 2 non-zeros with weight 15 each = 30
+        # Weighted zero proportion = 10/40 = 0.25
+        values = np.array([0.0, 0.0, 1.0, 2.0])
+        weights = np.array([5.0, 5.0, 15.0, 15.0])
+
+        df = df_type({"performance": values, "weight": weights})
+
+        scaler = QuantilePerformanceScaler(
+            features=["performance"], prefix="", weight_column="weight"
+        )
+        scaler.fit(df)
+
+        assert abs(scaler._zero_proportion["performance"] - 0.25) < 1e-10
+
+    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
+    def test_monotonicity_preserved_with_weights(self, df_type, weighted_zero_inflated_data):
+        """Test that monotonicity is preserved with weighted scaling."""
+        values, weights = weighted_zero_inflated_data
+        df = df_type({"performance": values, "weight": weights})
+
+        scaler = QuantilePerformanceScaler(
+            features=["performance"], prefix="", weight_column="weight"
+        )
+        transformed = scaler.fit_transform(df)
+
+        if isinstance(transformed, pd.DataFrame):
+            scaled = transformed["performance"].values
+        else:
+            scaled = transformed["performance"].to_numpy()
+
+        # Check monotonicity
+        order = np.argsort(values)
+        sorted_scaled = scaled[order]
+        assert np.all(np.diff(sorted_scaled) >= -1e-10)
+
+    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
+    def test_bounded_zero_one_with_weights(self, df_type, weighted_zero_inflated_data):
+        """Test that output is bounded [0, 1] with weighted scaling."""
+        values, weights = weighted_zero_inflated_data
+        df = df_type({"performance": values, "weight": weights})
+
+        scaler = QuantilePerformanceScaler(
+            features=["performance"], prefix="", weight_column="weight"
+        )
+        transformed = scaler.fit_transform(df)
+
+        if isinstance(transformed, pd.DataFrame):
+            scaled = transformed["performance"].values
+        else:
+            scaled = transformed["performance"].to_numpy()
+
+        assert np.all((scaled >= 0) & (scaled <= 1))

The remaining files ({spforge-0.8.37.dist-info → spforge-0.8.38.dist-info}/WHEEL, /licenses/LICENSE, and /top_level.txt) are unchanged.