spforge 0.8.38__py3-none-any.whl → 0.8.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
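The substantive change in this release is in spforge/ratings/_base.py (first diff below): the performance-mean sanity check now uses a participation-weighted mean whenever weighted quantile scaling is active, since that is the quantity calibrated to 0.5. A minimal standalone sketch of that calculation, using plain polars and numpy with hypothetical "perf" and "minutes" columns rather than spforge internals:

import numpy as np
import polars as pl

# Hypothetical toy data: the zero performances sit on low-participation rows
df = pl.DataFrame({
    "perf": [0.0, 0.2, 0.5, 0.6, 0.7],     # scaled performance
    "minutes": [0.1, 0.3, 0.8, 1.0, 0.6],  # participation weight
})

# Keep only rows with finite values and strictly positive weights
mask = df["perf"].is_finite() & df["minutes"].is_finite() & (df["minutes"] > 0)
perf = df["perf"].filter(mask).to_numpy()
w = df["minutes"].filter(mask).to_numpy()

mean_val = float(np.average(perf, weights=w))  # about 0.53
assert 0.42 <= mean_val <= 0.58

In this toy data the unweighted mean is 0.40 and would fail the 0.42-0.58 band, while the weighted mean passes, because the zeros carry little participation weight.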

spforge/ratings/_base.py CHANGED
@@ -7,6 +7,7 @@ from abc import abstractmethod
 from typing import Any, Literal
 
 import narwhals.stable.v2 as nw
+import numpy as np
 import polars as pl
 from narwhals.stable.v2 import DataFrame
 from narwhals.stable.v2.typing import IntoFrameT
@@ -149,6 +150,17 @@ class RatingGenerator(FeatureGenerator):
 
         if self.performance_manager:
             if self.performance_manager:
+                # Wire in participation weight column for weighted quantile scaling
+                # This ensures zero-inflated distributions use weights for calibration
+                if (
+                    self.column_names
+                    and self.column_names.participation_weight
+                    and self.column_names.participation_weight in df.columns
+                ):
+                    self.performance_manager.quantile_weight_column = (
+                        self.column_names.participation_weight
+                    )
+
                 ori_perf_values = df[self.performance_manager.ori_performance_column].to_list()
                 df = nw.from_native(self.performance_manager.fit_transform(df))
                 assert (
@@ -165,7 +177,26 @@ class RatingGenerator(FeatureGenerator):
                 "Either transform it manually or set auto_scale_performance to True"
             )
 
-        if finite_perf.mean() < 0.42 or finite_perf.mean() > 0.58:
+        # Use weighted mean when weighted quantile scaling is active
+        # because the weighted mean is what's calibrated to 0.5
+        if (
+            self.performance_manager
+            and self.performance_manager._using_quantile_scaler
+            and self.performance_manager.quantile_weight_column
+            and self.performance_manager.quantile_weight_column in df.columns
+        ):
+            weights = df[self.performance_manager.quantile_weight_column]
+            valid_mask = perf.is_finite() & weights.is_finite() & (weights > 0)
+            if valid_mask.sum() > 0:
+                perf_values = perf.filter(valid_mask).to_numpy()
+                weight_values = weights.filter(valid_mask).to_numpy()
+                mean_val = float(np.average(perf_values, weights=weight_values))
+            else:
+                mean_val = float(finite_perf.mean())
+        else:
+            mean_val = float(finite_perf.mean())
+
+        if mean_val < 0.42 or mean_val > 0.58:
             raise ValueError(
                 f"Mean {self.performance_column} must be between 0.42 and 0.58. "
                 "Either transform it manually or set auto_scale_performance to True"
spforge-0.8.38.dist-info/METADATA → spforge-0.8.39.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: spforge
-Version: 0.8.38
+Version: 0.8.39
 Summary: A flexible framework for generating features, ratings, and building machine learning or other models for training and inference on sports data.
 Author-email: Mathias Holmstrøm <mathiasholmstom@gmail.com>
 License: See LICENSE file
spforge-0.8.38.dist-info/RECORD → spforge-0.8.39.dist-info/RECORD CHANGED
@@ -50,7 +50,7 @@ spforge/performance_transformers/__init__.py,sha256=J-5olqi1M_BUj3sN1NqAz9s28XAb
 spforge/performance_transformers/_performance_manager.py,sha256=lh7enqYLd1lXj1VTOiK5N880xkil5q1jRsM51fe_K5g,12322
 spforge/performance_transformers/_performances_transformers.py,sha256=nmjJTEH86JjFneWsnSWIYnUXQoUDskOraDO3VtuufIY,20931
 spforge/ratings/__init__.py,sha256=OZVH2Lo6END3n1X8qi4QcyAPlThIwAYwVKCiIuOQSQU,576
-spforge/ratings/_base.py,sha256=Z-VVXWmnzR0O7o2_Q2x2ru_3uiTMpWqKDGQaNBJxCMA,14927
+spforge/ratings/_base.py,sha256=Stl_Y2gjQfS1jq_6CfeRG_e3R5Pei34WETdG6CaibGs,16487
 spforge/ratings/_player_rating.py,sha256=AIpDEl6cZaC3urcY-jFFgUWd4WZ71A33c5mOPfkXdMs,68178
 spforge/ratings/_team_rating.py,sha256=3m90-R2zW0k5EHwjw-83Hacz91fGmxW1LQ8ZUGHlgt4,24970
 spforge/ratings/enums.py,sha256=maG0X4WMQeMVAc2wbceq1an-U-z8moZGeG2BAgfICDA,1809
@@ -71,7 +71,7 @@ spforge/transformers/_other_transformer.py,sha256=w2a7Wnki3vJe4GAkSa4kealw0GILIo
 spforge/transformers/_predictor.py,sha256=2sE6gfVrilXzPVcBurSrtqHw33v2ljygQcEYXt9LhZc,3119
 spforge/transformers/_simple_transformer.py,sha256=zGUFNQYMeoDSa2CoQejQNiNmKCBN5amWTvyOchiUHj0,5660
 spforge/transformers/_team_ratio_predictor.py,sha256=g8_bR53Yyv0iNCtol1O9bgJSeZcIco_AfbQuUxQJkeY,6884
-spforge-0.8.38.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+spforge-0.8.39.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
 tests/test_autopipeline.py,sha256=7cNAn-nmGolfyfk3THh9IKcHZfRA-pLYC_xAyMg-No4,26863
 tests/test_autopipeline_context.py,sha256=IuRUY4IA6uMObvbl2pXSaXO2_tl3qX6wEbTZY0dkTMI,1240
 tests/test_feature_generator_pipeline.py,sha256=CK0zVL8PfTncy3RmG9i-YpgwjOIV7yJhV7Q44tbetI8,19020
@@ -92,8 +92,8 @@ tests/feature_generator/test_rolling_mean_days.py,sha256=EyOvdJDnmgPfe13uQBOkwo7
 tests/feature_generator/test_rolling_window.py,sha256=_o9oljcAIZ14iI7e8WFeAsfXxILnyqBffit21HOvII4,24378
 tests/hyperparameter_tuning/test_estimator_tuner.py,sha256=iewME41d6LR2aQ0OtohGFtN_ocJUwTeqvs6L0QDmfG4,4413
 tests/hyperparameter_tuning/test_rating_tuner.py,sha256=usjC2ioO_yWRjjNAlRTyMVYheOrCi0kKocmHQHdTmpM,18699
-tests/performance_transformers/test_performance_manager.py,sha256=tHyyyjAotJLtrViWx3j0DaNDqp3nowooMu5Wop7DjBE,24766
-tests/performance_transformers/test_performances_transformers.py,sha256=QyLNzis7yOBsjjclhuYrbZFaSmlTcSAbGVzNvK1B-SU,27817
+tests/performance_transformers/test_performance_manager.py,sha256=Ob4s86hdnR_4RC9ZG3lpB5O4Gysr2cLyTmCsO6uWomc,21244
+tests/performance_transformers/test_performances_transformers.py,sha256=2OLpFgBolU8e-1Pga3hiOGWWHhjYpfx8Qrf9YXiqjUw,20919
 tests/ratings/test_player_rating_generator.py,sha256=1Pkx0H8xJMTeLc2Fu9zJcoDpBWiY2zCVSxuBFJk2uEs,110717
 tests/ratings/test_player_rating_no_mutation.py,sha256=GzO3Hl__5K68DS3uRLefwnbcTJOvBM7cZqww4M21UZM,8493
 tests/ratings/test_ratings_property.py,sha256=ckyfGILXa4tfQvsgyXEzBDNr2DUmHwFRV13N60w66iE,6561
@@ -108,7 +108,7 @@ tests/transformers/test_other_transformer.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 tests/transformers/test_predictor_transformer.py,sha256=N1aBYLjN3ldpYZLwjih_gTFYSMitrZu-PNK78W6RHaQ,6877
 tests/transformers/test_simple_transformer.py,sha256=wWR0qjLb_uS4HXrJgGdiqugOY1X7kwd1_OPS02IT2b8,4676
 tests/transformers/test_team_ratio_predictor.py,sha256=fOUP_JvNJi-3kom3ZOs1EdG0I6Z8hpLpYKNHu1eWtOw,8562
-spforge-0.8.38.dist-info/METADATA,sha256=XXk1_WwD1gWvzRk08OSagsR6_w0qJAjcWX57-fwL9rg,20048
-spforge-0.8.38.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
-spforge-0.8.38.dist-info/top_level.txt,sha256=6UW2M5a7WKOeaAi900qQmRKNj5-HZzE8-eUD9Y9LTq0,23
-spforge-0.8.38.dist-info/RECORD,,
+spforge-0.8.39.dist-info/METADATA,sha256=njbTQ33nwPOZ71PhHQDxUWZzP4MjSavx8sT-JgK2fio,20048
+spforge-0.8.39.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+spforge-0.8.39.dist-info/top_level.txt,sha256=6UW2M5a7WKOeaAi900qQmRKNj5-HZzE8-eUD9Y9LTq0,23
+spforge-0.8.39.dist-info/RECORD,,
tests/performance_transformers/test_performance_manager.py CHANGED
@@ -437,143 +437,48 @@ class TestZeroInflationHandling:
 
 
 class TestWeightedQuantileScaling:
-    """Tests for weighted quantile scaling in PerformanceManager."""
+    """Test that RatingGenerator wires participation weights to quantile scaling."""
 
-    @pytest.fixture
-    def weighted_zero_inflated_data(self):
-        """Create zero-inflated data where high-weight rows have higher non-zero rate."""
-        np.random.seed(42)
-        n = 1000
-
-        # Create weights (e.g., minutes played)
-        weights = np.random.exponential(scale=20, size=n) + 1
-
-        # High-weight rows have lower zero probability
-        values = []
-        for w in weights:
-            zero_prob = 0.6 - 0.4 * (w / weights.max())
-            if np.random.random() < zero_prob:
-                values.append(0.0)
-            else:
-                values.append(np.random.exponential(scale=2))
-
-        return np.array(values), weights
-
-    @pytest.mark.parametrize("frame", ["pd", "pl"])
-    def test_performance_manager_with_weight_column(self, frame, weighted_zero_inflated_data):
-        """Test that PerformanceManager passes weight column to QuantilePerformanceScaler."""
-        values, weights = weighted_zero_inflated_data
-        df = _make_native_df(frame, {"x": values, "minutes": weights})
-
-        pm = PerformanceManager(
-            features=["x"],
-            transformer_names=None,  # Use defaults, auto-detect zero inflation
-            prefix="performance__",
-            performance_column="perf",
-            zero_inflation_threshold=0.15,
-            quantile_weight_column="minutes",
-        )
-
-        pm.fit(df)
-
-        # Should have switched to quantile scaler
-        assert pm._using_quantile_scaler is True
-        assert isinstance(pm.transformers[-1], QuantilePerformanceScaler)
-        # And should have the weight column set
-        assert pm.transformers[-1].weight_column == "minutes"
-
-    @pytest.mark.parametrize("frame", ["pd", "pl"])
-    def test_weighted_scaling_reduces_weighted_bias(self, frame, weighted_zero_inflated_data):
-        """Test that weighted scaling produces weighted mean closer to 0.5."""
-        values, weights = weighted_zero_inflated_data
-        df = _make_native_df(frame, {"x": values, "minutes": weights})
-
-        # With weighted scaling
-        pm_weighted = PerformanceManager(
-            features=["x"],
-            transformer_names=None,
-            prefix="performance__",
-            performance_column="perf",
-            zero_inflation_threshold=0.15,
-            quantile_weight_column="minutes",
-        )
-
-        result_weighted = pm_weighted.fit_transform(df)
-        result_weighted_nw = nw.from_native(result_weighted)
-        scaled_weighted = result_weighted_nw["performance__perf"].to_numpy()
-
-        # Without weighted scaling
-        pm_unweighted = PerformanceManager(
-            features=["x"],
-            transformer_names=None,
-            prefix="performance__",
-            performance_column="perf",
-            zero_inflation_threshold=0.15,
-            quantile_weight_column=None,  # No weighting
-        )
-
-        result_unweighted = pm_unweighted.fit_transform(df)
-        result_unweighted_nw = nw.from_native(result_unweighted)
-        scaled_unweighted = result_unweighted_nw["performance__perf"].to_numpy()
-
-        # Compute weighted means
-        weighted_mean_of_weighted = np.average(scaled_weighted, weights=weights)
-        weighted_mean_of_unweighted = np.average(scaled_unweighted, weights=weights)
-
-        # Weighted scaling should have weighted mean closer to 0.5
-        assert abs(weighted_mean_of_weighted - 0.5) < abs(weighted_mean_of_unweighted - 0.5), (
-            f"Weighted mean with weighted scaling ({weighted_mean_of_weighted:.4f}) "
-            f"should be closer to 0.5 than without ({weighted_mean_of_unweighted:.4f})"
-        )
-
-    @pytest.mark.parametrize("frame", ["pd", "pl"])
-    def test_performance_weights_manager_with_quantile_weight_column(
-        self, frame, weighted_zero_inflated_data
-    ):
-        """Test that PerformanceWeightsManager also supports quantile_weight_column."""
-        from spforge.performance_transformers._performance_manager import ColumnWeight
-
-        values, weights = weighted_zero_inflated_data
-        df = _make_native_df(frame, {"feat_a": values, "minutes": weights})
+    def test_rating_generator_wires_weight_column(self):
+        """
+        RatingGenerator should automatically wire participation_weight to
+        quantile_weight_column when using auto_scale_performance with zero-inflated data.
+        """
+        from spforge import ColumnNames
+        from spforge.ratings import PlayerRatingGenerator
 
-        column_weights = [ColumnWeight(name="feat_a", weight=1.0)]
-        manager = PerformanceWeightsManager(
-            weights=column_weights,
-            transformer_names=None,
-            prefix="",
-            zero_inflation_threshold=0.15,
-            quantile_weight_column="minutes",
-        )
-
-        manager.fit(df)
-
-        # Should have switched to quantile scaler with weight column
-        assert manager._using_quantile_scaler is True
-        assert manager.transformers[-1].weight_column == "minutes"
-
-    @pytest.mark.parametrize("frame", ["pd", "pl"])
-    def test_weight_column_not_used_when_no_zero_inflation(self, frame):
-        """Test that weight column is not needed when zero inflation is not detected."""
         np.random.seed(42)
-        # Normal distribution - no zero inflation
-        data = np.random.normal(loc=0.5, scale=0.1, size=1000)
-        weights = np.random.exponential(scale=20, size=1000) + 1
-
-        df = _make_native_df(frame, {"x": data, "minutes": weights})
-
-        pm = PerformanceManager(
-            features=["x"],
-            transformer_names=None,
-            prefix="performance__",
-            performance_column="perf",
-            zero_inflation_threshold=0.15,
-            quantile_weight_column="minutes",
+        data = {"player_id": [], "team_id": [], "match_id": [], "start_date": [], "perf": [], "minutes": []}
+
+        for match_idx in range(50):
+            date = f"2024-{(match_idx // 28) + 1:02d}-{(match_idx % 28) + 1:02d}"
+            for team_idx in range(2):
+                for player_idx in range(5):
+                    minutes = min(np.random.exponential(scale=20) + 5, 48)
+                    # Zero-inflated: high-minutes players more likely non-zero
+                    zero_prob = 0.7 - 0.5 * (minutes / 48)
+                    perf = 0.0 if np.random.random() < zero_prob else np.random.exponential(0.1)
+
+                    data["player_id"].append(f"P{team_idx}_{player_idx}")
+                    data["team_id"].append(f"T{team_idx}")
+                    data["match_id"].append(f"M{match_idx}")
+                    data["start_date"].append(date)
+                    data["perf"].append(perf)
+                    data["minutes"].append(minutes / 48)
+
+        cn = ColumnNames(
+            player_id="player_id", team_id="team_id", match_id="match_id",
+            start_date="start_date", update_match_id="match_id", participation_weight="minutes",
         )
 
-        pm.fit(df)
+        gen = PlayerRatingGenerator(performance_column="perf", column_names=cn, auto_scale_performance=True)
+        gen.fit_transform(pl.DataFrame(data))
 
-        # Should NOT have switched to quantile scaler
-        assert pm._using_quantile_scaler is False
+        pm = gen.performance_manager
+        if pm._using_quantile_scaler:
+            assert pm.transformers[-1].weight_column == "minutes", (
+                "RatingGenerator should wire quantile_weight_column to participation_weight"
+            )
 
 
 class TestAutoScalePerformanceBounds:
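The replacement test above only asserts the wiring (that the scaler's weight_column ends up set to the participation-weight column). The property it relies on, a weighted mean of the scaled output near 0.5, follows from ranking each value by the share of total weight at or below it rather than by row count. A rough weighted-ECDF sketch of that idea, an illustration under that assumption rather than the actual QuantilePerformanceScaler algorithm, with zero-inflated data mirroring the deleted fixture:

import numpy as np

def weighted_ecdf_scale(values: np.ndarray, weights: np.ndarray) -> np.ndarray:
    # Map each value to the share of total weight at or below it (midpoint rule),
    # which makes the weighted mean of the output 0.5 by construction.
    order = np.argsort(values)
    cum_w = np.cumsum(weights[order])
    scaled_sorted = (cum_w - 0.5 * weights[order]) / cum_w[-1]
    scaled = np.empty_like(scaled_sorted)
    scaled[order] = scaled_sorted  # undo the sort
    return scaled

rng = np.random.default_rng(42)
w = rng.exponential(20.0, 10_000) + 1  # e.g. minutes played
# More weight -> lower zero probability, as in the removed fixture
x = np.where(rng.random(10_000) < 0.6 - 0.4 * w / w.max(), 0.0, rng.exponential(2.0, 10_000))

s = weighted_ecdf_scale(x, w)
print(float(np.average(s, weights=w)))  # ~0.5, the quantity the _base.py bounds check inspects

The real scaler presumably handles ties and the zero spike explicitly, which would explain why the surviving test allows a 0.02 tolerance around 0.5 rather than exact equality.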
tests/performance_transformers/test_performances_transformers.py CHANGED
@@ -554,211 +554,38 @@ class TestQuantilePerformanceScaler:
 
 
 class TestWeightedQuantilePerformanceScaler:
-    """Tests for weighted quantile scaling."""
+    """Tests for weighted quantile scaling algorithm."""
 
-    @pytest.fixture
-    def weighted_zero_inflated_data(self):
-        """Create zero-inflated data where high-weight rows have higher non-zero rate."""
+    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
+    def test_weighted_mean_alignment(self, df_type):
+        """Weighted scaling should produce weighted mean close to 0.5."""
         np.random.seed(42)
         n = 1000
-
-        # Create weights (e.g., minutes played)
-        weights = np.random.exponential(scale=20, size=n) + 1  # 1 to ~100
-
-        # High-weight rows have lower zero probability
-        # This simulates: players with more minutes are more likely to have non-zero stats
+        weights = np.random.exponential(scale=20, size=n) + 1
         values = []
         for w in weights:
-            # Zero probability decreases as weight increases
-            zero_prob = 0.6 - 0.4 * (w / weights.max())  # 0.2 to 0.6
-            if np.random.random() < zero_prob:
-                values.append(0.0)
-            else:
-                values.append(np.random.exponential(scale=2))
-
-        return np.array(values), weights
-
-    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
-    def test_weighted_mean_alignment(self, df_type, weighted_zero_inflated_data):
-        """Test that weighted mean of scaled values is closer to 0.5 with weighted scaling."""
-        values, weights = weighted_zero_inflated_data
-        df = df_type({"performance": values, "weight": weights})
-
-        # Weighted scaler
-        weighted_scaler = QuantilePerformanceScaler(
-            features=["performance"], prefix="", weight_column="weight"
-        )
-        weighted_transformed = weighted_scaler.fit_transform(df)
-
-        if isinstance(weighted_transformed, pd.DataFrame):
-            weighted_scaled = weighted_transformed["performance"].values
-        else:
-            weighted_scaled = weighted_transformed["performance"].to_numpy()
+            zero_prob = 0.6 - 0.4 * (w / weights.max())
+            values.append(0.0 if np.random.random() < zero_prob else np.random.exponential(scale=2))
 
-        # Compute weighted mean
-        weighted_mean = np.average(weighted_scaled, weights=weights)
+        df = df_type({"performance": np.array(values), "weight": weights})
+        scaler = QuantilePerformanceScaler(features=["performance"], prefix="", weight_column="weight")
+        result = scaler.fit_transform(df)
 
-        # Weighted scaling should have weighted mean close to 0.5
-        assert abs(weighted_mean - 0.5) < 0.02, (
-            f"Weighted mean should be close to 0.5, got {weighted_mean}"
-        )
+        scaled = result["performance"].values if isinstance(result, pd.DataFrame) else result["performance"].to_numpy()
+        weighted_mean = np.average(scaled, weights=weights)
+        assert abs(weighted_mean - 0.5) < 0.02
 
     @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
     def test_backward_compatibility_without_weights(self, df_type):
-        """Test that weight_column=None matches original unweighted behavior."""
+        """weight_column=None should match original unweighted behavior."""
         np.random.seed(42)
-        n = 500
-        zeros = np.zeros(200)
-        nonzeros = np.random.exponential(scale=2, size=n - 200)
-        raw = np.concatenate([zeros, nonzeros])
+        raw = np.concatenate([np.zeros(200), np.random.exponential(scale=2, size=300)])
         np.random.shuffle(raw)
-
         df = df_type({"performance": raw})
 
-        # Unweighted scaler (explicitly None)
-        unweighted_scaler = QuantilePerformanceScaler(
-            features=["performance"], prefix="", weight_column=None
-        )
-        unweighted_result = unweighted_scaler.fit_transform(df)
-
-        # Scaler without weight_column argument
-        default_scaler = QuantilePerformanceScaler(features=["performance"], prefix="")
-        default_result = default_scaler.fit_transform(df)
-
-        if isinstance(unweighted_result, pd.DataFrame):
-            unweighted_values = unweighted_result["performance"].values
-            default_values = default_result["performance"].values
-        else:
-            unweighted_values = unweighted_result["performance"].to_numpy()
-            default_values = default_result["performance"].to_numpy()
-
-        # Results should be identical
-        assert np.allclose(unweighted_values, default_values, atol=1e-10)
-
-    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
-    def test_equal_weights_matches_unweighted(self, df_type):
-        """Test that all equal weights produce same result as unweighted."""
-        np.random.seed(42)
-        n = 500
-        zeros = np.zeros(200)
-        nonzeros = np.random.exponential(scale=2, size=n - 200)
-        raw = np.concatenate([zeros, nonzeros])
-        np.random.shuffle(raw)
-        equal_weights = np.ones(n)
-
-        df = df_type({"performance": raw, "weight": equal_weights})
-
-        # Weighted scaler with equal weights
-        weighted_scaler = QuantilePerformanceScaler(
-            features=["performance"], prefix="", weight_column="weight"
-        )
-        weighted_result = weighted_scaler.fit_transform(df)
-
-        # Unweighted scaler
-        unweighted_scaler = QuantilePerformanceScaler(
-            features=["performance"], prefix="", weight_column=None
-        )
-        unweighted_result = unweighted_scaler.fit_transform(df)
-
-        if isinstance(weighted_result, pd.DataFrame):
-            weighted_values = weighted_result["performance"].values
-            unweighted_values = unweighted_result["performance"].values
-        else:
-            weighted_values = weighted_result["performance"].to_numpy()
-            unweighted_values = unweighted_result["performance"].to_numpy()
-
-        # Results should be very close (may differ slightly due to algorithm differences)
-        assert np.allclose(weighted_values, unweighted_values, atol=0.02)
-
-    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
-    def test_weights_with_zeros_excluded(self, df_type):
-        """Test that rows with zero weights are excluded from fitting."""
-        np.random.seed(42)
-        # Create data where zeros have zero weight
-        values = np.array([0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0])
-        weights = np.array([0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0])  # Zero weight for zeros
-
-        df = df_type({"performance": values, "weight": weights})
+        result1 = QuantilePerformanceScaler(features=["performance"], prefix="", weight_column=None).fit_transform(df)
+        result2 = QuantilePerformanceScaler(features=["performance"], prefix="").fit_transform(df)
 
-        scaler = QuantilePerformanceScaler(
-            features=["performance"], prefix="", weight_column="weight"
-        )
-        scaler.fit(df)
-
-        # Zero proportion should be 0 because zero-weight rows are excluded
-        assert scaler._zero_proportion["performance"] == 0.0
-
-    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
-    def test_weighted_zero_proportion(self, df_type):
-        """Test that zero proportion is computed using weights."""
-        # 3 zeros with weight 10 each = 30
-        # 7 non-zeros with weight 10 each = 70
-        # Weighted zero proportion = 30/100 = 0.3
-        values = np.array([0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0])
-        weights = np.array([10.0] * 10)
-
-        df = df_type({"performance": values, "weight": weights})
-
-        scaler = QuantilePerformanceScaler(
-            features=["performance"], prefix="", weight_column="weight"
-        )
-        scaler.fit(df)
-
-        assert abs(scaler._zero_proportion["performance"] - 0.3) < 1e-10
-
-    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
-    def test_weighted_zero_proportion_unequal_weights(self, df_type):
-        """Test weighted zero proportion with unequal weights."""
-        # 2 zeros with weight 5 each = 10
-        # 2 non-zeros with weight 15 each = 30
-        # Weighted zero proportion = 10/40 = 0.25
-        values = np.array([0.0, 0.0, 1.0, 2.0])
-        weights = np.array([5.0, 5.0, 15.0, 15.0])
-
-        df = df_type({"performance": values, "weight": weights})
-
-        scaler = QuantilePerformanceScaler(
-            features=["performance"], prefix="", weight_column="weight"
-        )
-        scaler.fit(df)
-
-        assert abs(scaler._zero_proportion["performance"] - 0.25) < 1e-10
-
-    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
-    def test_monotonicity_preserved_with_weights(self, df_type, weighted_zero_inflated_data):
-        """Test that monotonicity is preserved with weighted scaling."""
-        values, weights = weighted_zero_inflated_data
-        df = df_type({"performance": values, "weight": weights})
-
-        scaler = QuantilePerformanceScaler(
-            features=["performance"], prefix="", weight_column="weight"
-        )
-        transformed = scaler.fit_transform(df)
-
-        if isinstance(transformed, pd.DataFrame):
-            scaled = transformed["performance"].values
-        else:
-            scaled = transformed["performance"].to_numpy()
-
-        # Check monotonicity
-        order = np.argsort(values)
-        sorted_scaled = scaled[order]
-        assert np.all(np.diff(sorted_scaled) >= -1e-10)
-
-    @pytest.mark.parametrize("df_type", [pd.DataFrame, pl.DataFrame])
-    def test_bounded_zero_one_with_weights(self, df_type, weighted_zero_inflated_data):
-        """Test that output is bounded [0, 1] with weighted scaling."""
-        values, weights = weighted_zero_inflated_data
-        df = df_type({"performance": values, "weight": weights})
-
-        scaler = QuantilePerformanceScaler(
-            features=["performance"], prefix="", weight_column="weight"
-        )
-        transformed = scaler.fit_transform(df)
-
-        if isinstance(transformed, pd.DataFrame):
-            scaled = transformed["performance"].values
-        else:
-            scaled = transformed["performance"].to_numpy()
-
-        assert np.all((scaled >= 0) & (scaled <= 1))
+        v1 = result1["performance"].values if isinstance(result1, pd.DataFrame) else result1["performance"].to_numpy()
+        v2 = result2["performance"].values if isinstance(result2, pd.DataFrame) else result2["performance"].to_numpy()
+        assert np.allclose(v1, v2, atol=1e-10)
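The deleted zero-proportion tests above spell out the intended definition of the weighted zero share: the weight mass sitting on exact zeros divided by the total weight, with zero-weight rows excluded from fitting. A short standalone check reproducing the arithmetic from those removed cases (the helper function below is a hypothetical illustration, not spforge API):

import numpy as np

def weighted_zero_proportion(values: np.ndarray, weights: np.ndarray) -> float:
    # Share of total weight carried by rows whose value is exactly zero;
    # rows with zero weight are dropped entirely, as the removed tests expected.
    mask = weights > 0
    v, w = values[mask], weights[mask]
    return float(w[v == 0].sum() / w.sum())

# Equal weights: 3 zeros and 7 non-zeros, weight 10 each -> 30 / 100 = 0.3
assert weighted_zero_proportion(
    np.array([0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]), np.full(10, 10.0)
) == 0.3

# Unequal weights: zeros weigh 5 each, non-zeros 15 each -> 10 / 40 = 0.25
assert weighted_zero_proportion(
    np.array([0.0, 0.0, 1.0, 2.0]), np.array([5.0, 5.0, 15.0, 15.0])
) == 0.25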