spforge 0.8.8__py3-none-any.whl → 0.8.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of spforge might be problematic.
- spforge/autopipeline.py +169 -5
- spforge/estimator/_group_by_estimator.py +11 -3
- spforge/performance_transformers/_performance_manager.py +2 -4
- spforge/ratings/_player_rating.py +131 -28
- spforge/ratings/start_rating_generator.py +1 -1
- spforge/ratings/team_start_rating_generator.py +1 -1
- spforge/ratings/utils.py +16 -6
- spforge/scorer/_score.py +42 -11
- spforge/transformers/_other_transformer.py +38 -8
- {spforge-0.8.8.dist-info → spforge-0.8.18.dist-info}/METADATA +1 -1
- {spforge-0.8.8.dist-info → spforge-0.8.18.dist-info}/RECORD +20 -18
- {spforge-0.8.8.dist-info → spforge-0.8.18.dist-info}/WHEEL +1 -1
- tests/performance_transformers/test_performance_manager.py +15 -0
- tests/ratings/test_player_rating_generator.py +127 -0
- tests/ratings/test_player_rating_no_mutation.py +214 -0
- tests/ratings/test_utils_scaled_weights.py +136 -0
- tests/scorer/test_score.py +142 -0
- tests/test_autopipeline.py +336 -6
- {spforge-0.8.8.dist-info → spforge-0.8.18.dist-info}/licenses/LICENSE +0 -0
- {spforge-0.8.8.dist-info → spforge-0.8.18.dist-info}/top_level.txt +0 -0
tests/ratings/test_player_rating_no_mutation.py
ADDED
@@ -0,0 +1,214 @@
+"""Tests to ensure PlayerRatingGenerator does not mutate input columns."""
+
+import polars as pl
+import pytest
+
+from spforge import ColumnNames
+from spforge.ratings import PlayerRatingGenerator, RatingKnownFeatures
+
+
+@pytest.fixture
+def cn_with_projected():
+    """ColumnNames with both participation_weight and projected_participation_weight."""
+    return ColumnNames(
+        player_id="pid",
+        team_id="tid",
+        match_id="mid",
+        start_date="dt",
+        update_match_id="mid",
+        participation_weight="minutes",
+        projected_participation_weight="minutes_prediction",
+    )
+
+
+@pytest.fixture
+def fit_df():
+    """Training data with minutes > 1 (will trigger auto-scaling)."""
+    return pl.DataFrame(
+        {
+            "pid": ["P1", "P2", "P3", "P4"],
+            "tid": ["T1", "T1", "T2", "T2"],
+            "mid": ["M1", "M1", "M1", "M1"],
+            "dt": ["2024-01-01"] * 4,
+            "perf": [0.6, 0.4, 0.7, 0.3],
+            "minutes": [30.0, 25.0, 32.0, 28.0],
+            "minutes_prediction": [28.0, 24.0, 30.0, 26.0],
+        }
+    )
+
+
+@pytest.fixture
+def future_df():
+    """Future prediction data with minutes > 1 (will trigger auto-scaling)."""
+    return pl.DataFrame(
+        {
+            "pid": ["P1", "P2", "P3", "P4"],
+            "tid": ["T1", "T1", "T2", "T2"],
+            "mid": ["M2", "M2", "M2", "M2"],
+            "dt": ["2024-01-02"] * 4,
+            "minutes": [30.0, 25.0, 32.0, 28.0],
+            "minutes_prediction": [28.0, 24.0, 30.0, 26.0],
+        }
+    )
+
+
+def test_fit_transform_does_not_mutate_participation_weight(cn_with_projected, fit_df):
+    """fit_transform should not modify the participation_weight column values."""
+    # Compare values by player_id between result and original
+    gen = PlayerRatingGenerator(
+        performance_column="perf",
+        column_names=cn_with_projected,
+        auto_scale_performance=True,
+        features_out=[RatingKnownFeatures.PLAYER_OFF_RATING],
+    )
+    result = gen.fit_transform(fit_df)
+
+    # Check that each player's minutes value is preserved
+    original_by_player = dict(zip(fit_df["pid"].to_list(), fit_df["minutes"].to_list()))
+    result_by_player = dict(zip(result["pid"].to_list(), result["minutes"].to_list()))
+
+    for pid, original_val in original_by_player.items():
+        result_val = result_by_player[pid]
+        assert result_val == original_val, (
+            f"participation_weight for player {pid} was mutated. "
+            f"Expected {original_val}, got {result_val}"
+        )
+
+
+def test_fit_transform_does_not_mutate_projected_participation_weight(cn_with_projected, fit_df):
+    """fit_transform should not modify the projected_participation_weight column values."""
+    gen = PlayerRatingGenerator(
+        performance_column="perf",
+        column_names=cn_with_projected,
+        auto_scale_performance=True,
+        features_out=[RatingKnownFeatures.PLAYER_OFF_RATING],
+    )
+    result = gen.fit_transform(fit_df)
+
+    # Check that each player's minutes_prediction value is preserved
+    original_by_player = dict(zip(fit_df["pid"].to_list(), fit_df["minutes_prediction"].to_list()))
+    result_by_player = dict(zip(result["pid"].to_list(), result["minutes_prediction"].to_list()))
+
+    for pid, original_val in original_by_player.items():
+        result_val = result_by_player[pid]
+        assert result_val == original_val, (
+            f"projected_participation_weight for player {pid} was mutated. "
+            f"Expected {original_val}, got {result_val}"
+        )
+
+
+def test_transform_does_not_mutate_participation_weight(cn_with_projected, fit_df, future_df):
+    """transform should not modify the participation_weight column values."""
+    gen = PlayerRatingGenerator(
+        performance_column="perf",
+        column_names=cn_with_projected,
+        auto_scale_performance=True,
+        features_out=[RatingKnownFeatures.PLAYER_OFF_RATING],
+    )
+    gen.fit_transform(fit_df)
+
+    result = gen.transform(future_df)
+
+    # Check that each player's minutes value is preserved
+    original_by_player = dict(zip(future_df["pid"].to_list(), future_df["minutes"].to_list()))
+    result_by_player = dict(zip(result["pid"].to_list(), result["minutes"].to_list()))
+
+    for pid, original_val in original_by_player.items():
+        result_val = result_by_player[pid]
+        assert result_val == original_val, (
+            f"participation_weight for player {pid} was mutated during transform. "
+            f"Expected {original_val}, got {result_val}"
+        )
+
+
+def test_transform_does_not_mutate_projected_participation_weight(cn_with_projected, fit_df, future_df):
+    """transform should not modify the projected_participation_weight column values."""
+    gen = PlayerRatingGenerator(
+        performance_column="perf",
+        column_names=cn_with_projected,
+        auto_scale_performance=True,
+        features_out=[RatingKnownFeatures.PLAYER_OFF_RATING],
+    )
+    gen.fit_transform(fit_df)
+
+    result = gen.transform(future_df)
+
+    # Check that each player's minutes_prediction value is preserved
+    original_by_player = dict(zip(future_df["pid"].to_list(), future_df["minutes_prediction"].to_list()))
+    result_by_player = dict(zip(result["pid"].to_list(), result["minutes_prediction"].to_list()))
+
+    for pid, original_val in original_by_player.items():
+        result_val = result_by_player[pid]
+        assert result_val == original_val, (
+            f"projected_participation_weight for player {pid} was mutated during transform. "
+            f"Expected {original_val}, got {result_val}"
+        )
+
+
+def test_future_transform_does_not_mutate_participation_weight(cn_with_projected, fit_df, future_df):
+    """future_transform should not modify the participation_weight column values."""
+    gen = PlayerRatingGenerator(
+        performance_column="perf",
+        column_names=cn_with_projected,
+        auto_scale_performance=True,
+        features_out=[RatingKnownFeatures.PLAYER_OFF_RATING],
+    )
+    gen.fit_transform(fit_df)
+
+    original_minutes = future_df["minutes"].to_list()
+    result = gen.future_transform(future_df)
+
+    # The minutes column should have the same values as before
+    result_minutes = result["minutes"].to_list()
+    assert result_minutes == original_minutes, (
+        f"participation_weight column was mutated during future_transform. "
+        f"Expected {original_minutes}, got {result_minutes}"
+    )
+
+
+def test_future_transform_does_not_mutate_projected_participation_weight(cn_with_projected, fit_df, future_df):
+    """future_transform should not modify the projected_participation_weight column values."""
+    gen = PlayerRatingGenerator(
+        performance_column="perf",
+        column_names=cn_with_projected,
+        auto_scale_performance=True,
+        features_out=[RatingKnownFeatures.PLAYER_OFF_RATING],
+    )
+    gen.fit_transform(fit_df)
+
+    original_minutes_pred = future_df["minutes_prediction"].to_list()
+    result = gen.future_transform(future_df)
+
+    # The minutes_prediction column should have the same values as before
+    result_minutes_pred = result["minutes_prediction"].to_list()
+    assert result_minutes_pred == original_minutes_pred, (
+        f"projected_participation_weight column was mutated during future_transform. "
+        f"Expected {original_minutes_pred}, got {result_minutes_pred}"
+    )
+
+
+def test_multiple_transforms_do_not_compound_scaling(cn_with_projected, fit_df, future_df):
+    """Multiple transform calls should not compound the scaling effect."""
+    gen = PlayerRatingGenerator(
+        performance_column="perf",
+        column_names=cn_with_projected,
+        auto_scale_performance=True,
+        features_out=[RatingKnownFeatures.PLAYER_OFF_RATING],
+    )
+    gen.fit_transform(fit_df)
+
+    # Call transform multiple times
+    result1 = gen.transform(future_df)
+    result2 = gen.transform(result1)
+    result3 = gen.transform(result2)
+
+    # After 3 transforms, each player's values should still be the same as original
+    original_by_player = dict(zip(future_df["pid"].to_list(), future_df["minutes_prediction"].to_list()))
+    final_by_player = dict(zip(result3["pid"].to_list(), result3["minutes_prediction"].to_list()))
+
+    for pid, original_val in original_by_player.items():
+        final_val = final_by_player[pid]
+        assert final_val == original_val, (
+            f"Multiple transforms compounded the scaling for player {pid}. "
+            f"Expected {original_val}, got {final_val}"
+        )
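The suite above pins down a no-mutation contract: fit_transform, transform, and future_transform must leave the caller's participation-weight columns untouched even when auto-scaling clips weights internally. A minimal sketch of one way to honor that contract follows — the internal column name and helper are hypothetical illustrations, not spforge's actual implementation:

```python
import polars as pl

# Hypothetical internal column name; spforge's real name may differ.
_INTERNAL_SCALED = "__scaled_participation_weight"


def scale_weights_copy_on_write(df: pl.DataFrame, weight_col: str) -> pl.DataFrame:
    """Clip weights into an internal column and drop it before returning,
    so the caller's weight_col is never overwritten."""
    out = df.with_columns(
        pl.when(pl.col(weight_col) > 1.0)
        .then(1.0)
        .otherwise(pl.col(weight_col))
        .alias(_INTERNAL_SCALED)
    )
    # ... downstream rating logic would read _INTERNAL_SCALED here ...
    return out.drop(_INTERNAL_SCALED)


df = pl.DataFrame({"minutes": [30.0, 25.0]})
result = scale_weights_copy_on_write(df, "minutes")
assert result["minutes"].to_list() == [30.0, 25.0]  # input values preserved
```

Because polars expressions build new DataFrames rather than writing in place, repeated calls under this pattern also cannot compound — which is exactly what test_multiple_transforms_do_not_compound_scaling checks.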
tests/ratings/test_utils_scaled_weights.py
ADDED
@@ -0,0 +1,136 @@
+"""Tests to ensure utility functions use scaled participation weights when available."""
+
+import polars as pl
+import pytest
+
+from spforge import ColumnNames
+from spforge.ratings.utils import (
+    _SCALED_PPW,
+    add_team_rating_projected,
+    add_rating_mean_projected,
+)
+
+
+@pytest.fixture
+def column_names():
+    return ColumnNames(
+        player_id="pid",
+        team_id="tid",
+        match_id="mid",
+        start_date="dt",
+        projected_participation_weight="ppw",
+    )
+
+
+@pytest.fixture
+def df_with_scaled():
+    """DataFrame with both raw and scaled projected participation weights."""
+    return pl.DataFrame({
+        "pid": ["A", "B", "C", "D"],
+        "tid": ["T1", "T1", "T2", "T2"],
+        "mid": ["M1", "M1", "M1", "M1"],
+        "dt": ["2024-01-01"] * 4,
+        "rating": [1100.0, 900.0, 1050.0, 950.0],
+        "ppw": [20.0, 5.0, 10.0, 10.0],  # Raw weights (would give wrong answer)
+        _SCALED_PPW: [1.0, 0.5, 1.0, 1.0],  # Scaled/clipped weights
+    })
+
+
+@pytest.fixture
+def df_without_scaled():
+    """DataFrame with only raw projected participation weights (no scaled column)."""
+    return pl.DataFrame({
+        "pid": ["A", "B", "C", "D"],
+        "tid": ["T1", "T1", "T2", "T2"],
+        "mid": ["M1", "M1", "M1", "M1"],
+        "dt": ["2024-01-01"] * 4,
+        "rating": [1100.0, 900.0, 1050.0, 950.0],
+        "ppw": [0.8, 0.4, 1.0, 1.0],  # Already scaled weights
+    })
+
+
+def test_add_team_rating_projected_uses_scaled_column(column_names, df_with_scaled):
+    """add_team_rating_projected should use _SCALED_PPW when available."""
+    result = add_team_rating_projected(
+        df=df_with_scaled,
+        column_names=column_names,
+        player_rating_col="rating",
+        team_rating_out="team_rating",
+    )
+
+    # With scaled weights (1.0, 0.5), T1 team rating = (1100*1.0 + 900*0.5) / (1.0+0.5) = 1550/1.5 = 1033.33
+    # If it used raw weights (20.0, 5.0), it would be (1100*20 + 900*5) / 25 = 26500/25 = 1060
+    t1_rating = result.filter(pl.col("tid") == "T1")["team_rating"][0]
+
+    expected_with_scaled = (1100.0 * 1.0 + 900.0 * 0.5) / (1.0 + 0.5)
+    wrong_with_raw = (1100.0 * 20.0 + 900.0 * 5.0) / (20.0 + 5.0)
+
+    assert t1_rating == pytest.approx(expected_with_scaled, rel=1e-6)
+    assert t1_rating != pytest.approx(wrong_with_raw, rel=1e-6)
+
+
+def test_add_team_rating_projected_falls_back_to_raw(column_names, df_without_scaled):
+    """add_team_rating_projected should use raw ppw when _SCALED_PPW is not available."""
+    result = add_team_rating_projected(
+        df=df_without_scaled,
+        column_names=column_names,
+        player_rating_col="rating",
+        team_rating_out="team_rating",
+    )
+
+    # With raw weights (0.8, 0.4), T1 team rating = (1100*0.8 + 900*0.4) / (0.8+0.4) = 1240/1.2 = 1033.33
+    t1_rating = result.filter(pl.col("tid") == "T1")["team_rating"][0]
+
+    expected = (1100.0 * 0.8 + 900.0 * 0.4) / (0.8 + 0.4)
+    assert t1_rating == pytest.approx(expected, rel=1e-6)
+
+
+def test_add_rating_mean_projected_uses_scaled_column(column_names, df_with_scaled):
+    """add_rating_mean_projected should use _SCALED_PPW when available."""
+    result = add_rating_mean_projected(
+        df=df_with_scaled,
+        column_names=column_names,
+        player_rating_col="rating",
+        rating_mean_out="mean_rating",
+    )
+
+    # With scaled weights, mean = (1100*1.0 + 900*0.5 + 1050*1.0 + 950*1.0) / (1.0+0.5+1.0+1.0)
+    # = (1100 + 450 + 1050 + 950) / 3.5 = 3550/3.5 = 1014.29
+    mean_rating = result["mean_rating"][0]
+
+    expected_with_scaled = (1100.0*1.0 + 900.0*0.5 + 1050.0*1.0 + 950.0*1.0) / (1.0+0.5+1.0+1.0)
+    wrong_with_raw = (1100.0*20.0 + 900.0*5.0 + 1050.0*10.0 + 950.0*10.0) / (20.0+5.0+10.0+10.0)
+
+    assert mean_rating == pytest.approx(expected_with_scaled, rel=1e-6)
+    assert mean_rating != pytest.approx(wrong_with_raw, rel=1e-6)
+
+
+def test_add_rating_mean_projected_falls_back_to_raw(column_names, df_without_scaled):
+    """add_rating_mean_projected should use raw ppw when _SCALED_PPW is not available."""
+    result = add_rating_mean_projected(
+        df=df_without_scaled,
+        column_names=column_names,
+        player_rating_col="rating",
+        rating_mean_out="mean_rating",
+    )
+
+    # With raw weights (0.8, 0.4, 1.0, 1.0)
+    mean_rating = result["mean_rating"][0]
+
+    expected = (1100.0*0.8 + 900.0*0.4 + 1050.0*1.0 + 950.0*1.0) / (0.8+0.4+1.0+1.0)
+    assert mean_rating == pytest.approx(expected, rel=1e-6)
+
+
+def test_scaled_weights_not_in_output(column_names, df_with_scaled):
+    """Verify utility functions don't add scaled columns to output unnecessarily."""
+    result = add_team_rating_projected(
+        df=df_with_scaled,
+        column_names=column_names,
+        player_rating_col="rating",
+        team_rating_out="team_rating",
+    )
+
+    # The scaled column should still be present (it was in input)
+    # but no new internal columns should be added
+    assert _SCALED_PPW in result.columns
+    assert "team_rating" in result.columns
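These tests encode a simple precedence rule: use the internal _SCALED_PPW column when it exists, otherwise fall back to the raw projected-participation-weight column, and compute a weight-normalized average either way. A sketch of that rule under assumed names — the constant's value and the helper below are illustrative, not spforge's exact code:

```python
import polars as pl

_SCALED_PPW = "__scaled_ppw"  # assumed value; the tests import the real constant


def team_rating_projected(df: pl.DataFrame, rating_col: str, raw_ppw_col: str,
                          team_col: str, out_col: str) -> pl.DataFrame:
    # Precedence rule: prefer the internally scaled weights when present.
    weight_col = _SCALED_PPW if _SCALED_PPW in df.columns else raw_ppw_col
    return df.with_columns(
        ((pl.col(rating_col) * pl.col(weight_col)).sum() / pl.col(weight_col).sum())
        .over(team_col)
        .alias(out_col)
    )


df = pl.DataFrame({
    "tid": ["T1", "T1"],
    "rating": [1100.0, 900.0],
    "ppw": [20.0, 5.0],
    _SCALED_PPW: [1.0, 0.5],
})
out = team_rating_projected(df, "rating", "ppw", "tid", "team_rating")
# (1100*1.0 + 900*0.5) / 1.5 = 1550/1.5 ≈ 1033.33, not the raw-weight 1060
assert abs(out["team_rating"][0] - 1550.0 / 1.5) < 1e-9
```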
tests/scorer/test_score.py
CHANGED
@@ -2138,3 +2138,145 @@ def test_scorers_respect_validation_column(scorer_factory, df_factory):
     score_all = scorer_factory().score(df)
     score_valid = scorer_factory().score(df_valid)
     assert score_all == score_valid
+
+
+# ============================================================================
+# PWMSE evaluation_labels Extension Tests
+# ============================================================================
+
+
+@pytest.mark.parametrize("df_type", [pl.DataFrame, pd.DataFrame])
+def test_pwmse__evaluation_labels_extends_predictions(df_type):
+    """PWMSE with evaluation_labels as superset extends predictions with small probs."""
+    df = create_dataframe(
+        df_type,
+        {
+            "pred": [
+                [0.3, 0.5, 0.2],
+                [0.2, 0.6, 0.2],
+            ],
+            "target": [0, 1],
+        },
+    )
+
+    scorer = PWMSE(
+        pred_column="pred",
+        target="target",
+        labels=[0, 1, 2],
+        evaluation_labels=[-1, 0, 1, 2, 3],
+    )
+    score = scorer.score(df)
+
+    n_eval_labels = 5
+    eps = 1e-5
+    preds_original = np.array([[0.3, 0.5, 0.2], [0.2, 0.6, 0.2]])
+    extended = np.full((2, n_eval_labels), eps, dtype=np.float64)
+    extended[:, 1] = preds_original[:, 0]
+    extended[:, 2] = preds_original[:, 1]
+    extended[:, 3] = preds_original[:, 2]
+    row_sums = extended.sum(axis=1, keepdims=True)
+    preds_renorm = extended / row_sums
+
+    eval_labels = np.array([-1, 0, 1, 2, 3], dtype=np.float64)
+    targets = np.array([0, 1], dtype=np.float64)
+    diffs_sqd = (eval_labels[None, :] - targets[:, None]) ** 2
+    expected = float((diffs_sqd * preds_renorm).sum(axis=1).mean())
+
+    assert abs(score - expected) < 1e-10
+
+
+@pytest.mark.parametrize("df_type", [pl.DataFrame, pd.DataFrame])
+def test_pwmse__evaluation_labels_exact_match(df_type):
+    """PWMSE with evaluation_labels identical to labels (no-op)."""
+    df = create_dataframe(
+        df_type,
+        {
+            "pred": [
+                [0.3, 0.5, 0.2],
+                [0.2, 0.6, 0.2],
+            ],
+            "target": [0, 1],
+        },
+    )
+
+    scorer_with_eval = PWMSE(
+        pred_column="pred",
+        target="target",
+        labels=[0, 1, 2],
+        evaluation_labels=[0, 1, 2],
+    )
+    scorer_without_eval = PWMSE(
+        pred_column="pred",
+        target="target",
+        labels=[0, 1, 2],
+    )
+
+    score_with = scorer_with_eval.score(df)
+    score_without = scorer_without_eval.score(df)
+
+    assert abs(score_with - score_without) < 1e-10
+
+
+@pytest.mark.parametrize("df_type", [pl.DataFrame, pd.DataFrame])
+def test_pwmse__evaluation_labels_partial_overlap_raises(df_type):
+    """PWMSE with partial overlap between labels and evaluation_labels raises."""
+    with pytest.raises(ValueError, match="evaluation_labels must be a subset or superset"):
+        PWMSE(
+            pred_column="pred",
+            target="target",
+            labels=[0, 1, 2],
+            evaluation_labels=[1, 2, 3],
+        )
+
+
+@pytest.mark.parametrize("df_type", [pl.DataFrame, pd.DataFrame])
+def test_pwmse__evaluation_labels_extends_with_compare_to_naive(df_type):
+    """PWMSE extension mode works correctly with compare_to_naive."""
+    df = create_dataframe(
+        df_type,
+        {
+            "pred": [
+                [0.8, 0.15, 0.05],
+                [0.1, 0.7, 0.2],
+                [0.05, 0.15, 0.8],
+                [0.3, 0.4, 0.3],
+            ],
+            "target": [0, 1, 2, 1],
+        },
+    )
+
+    scorer = PWMSE(
+        pred_column="pred",
+        target="target",
+        labels=[0, 1, 2],
+        evaluation_labels=[-1, 0, 1, 2, 3],
+        compare_to_naive=True,
+    )
+    score = scorer.score(df)
+
+    n_eval_labels = 5
+    eps = 1e-5
+    preds_original = np.array([
+        [0.8, 0.15, 0.05],
+        [0.1, 0.7, 0.2],
+        [0.05, 0.15, 0.8],
+        [0.3, 0.4, 0.3],
+    ])
+    extended = np.full((4, n_eval_labels), eps, dtype=np.float64)
+    extended[:, 1] = preds_original[:, 0]
+    extended[:, 2] = preds_original[:, 1]
+    extended[:, 3] = preds_original[:, 2]
+    row_sums = extended.sum(axis=1, keepdims=True)
+    preds_renorm = extended / row_sums
+
+    eval_labels = np.array([-1, 0, 1, 2, 3], dtype=np.float64)
+    targets = np.array([0, 1, 2, 1], dtype=np.float64)
+    diffs_sqd = (eval_labels[None, :] - targets[:, None]) ** 2
+    model_score = float((diffs_sqd * preds_renorm).sum(axis=1).mean())
+
+    naive_probs = np.array([0.0, 0.25, 0.5, 0.25, 0.0])
+    naive_preds = np.tile(naive_probs, (4, 1))
+    naive_score = float((diffs_sqd * naive_preds).sum(axis=1).mean())
+
+    expected = naive_score - model_score
+    assert abs(score - expected) < 1e-10
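The PWMSE tests above all recompute the same extension recipe by hand: when evaluation_labels is a strict superset of labels, the prediction matrix is widened, the new label positions are seeded with a small probability, each row is renormalized, and the probability-weighted squared error is taken against the full evaluation grid. A standalone sketch of that recipe — the eps value 1e-5 mirrors the tests, but treat it and the function shape as assumptions about the implementation:

```python
import numpy as np


def pwmse_extended(preds: np.ndarray, targets: np.ndarray,
                   labels: list, eval_labels: list, eps: float = 1e-5) -> float:
    # Seed every evaluation label with eps, then copy the model's
    # probabilities into the positions of the original labels.
    extended = np.full((preds.shape[0], len(eval_labels)), eps, dtype=np.float64)
    for src_idx, label in enumerate(labels):
        extended[:, eval_labels.index(label)] = preds[:, src_idx]
    extended /= extended.sum(axis=1, keepdims=True)  # renormalize each row
    # Probability-weighted squared error against every evaluation label.
    diffs_sqd = (np.asarray(eval_labels, dtype=np.float64)[None, :]
                 - targets[:, None]) ** 2
    return float((diffs_sqd * extended).sum(axis=1).mean())


preds = np.array([[0.3, 0.5, 0.2], [0.2, 0.6, 0.2]])
targets = np.array([0.0, 1.0])
print(pwmse_extended(preds, targets, labels=[0, 1, 2], eval_labels=[-1, 0, 1, 2, 3]))
```

With compare_to_naive=True, the tests then expect the reported score to be naive_score minus model_score, where the naive predictor simply uses the empirical target distribution over the same evaluation grid.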