spforge 0.8.2-py3-none-any.whl → 0.8.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


spforge/scorer/_score.py CHANGED
@@ -350,6 +350,7 @@ class PWMSE(BaseScorer):
         labels: list[int] | None = None,
         compare_to_naive: bool = False,
         naive_granularity: list[str] | None = None,
+        evaluation_labels: list[int] | None = None,
     ):
         self.pred_column_name = pred_column
         super().__init__(
@@ -363,12 +364,39 @@ class PWMSE(BaseScorer):
             naive_granularity=naive_granularity,
         )
         self.labels = labels
+        self.evaluation_labels = evaluation_labels
+
+        self._eval_indices: list[int] | None = None
+        if self.evaluation_labels is not None and self.labels is not None:
+            label_to_idx = {lbl: i for i, lbl in enumerate(self.labels)}
+            self._eval_indices = [label_to_idx[lbl] for lbl in self.evaluation_labels]
+
+    def _slice_and_renormalize(self, preds: np.ndarray) -> np.ndarray:
+        if self._eval_indices is None:
+            return preds
+        sliced = preds[:, self._eval_indices]
+        row_sums = sliced.sum(axis=1, keepdims=True)
+        row_sums = np.where(row_sums == 0, 1.0, row_sums)
+        return sliced / row_sums
+
+    def _get_scoring_labels(self) -> list[int]:
+        if self.evaluation_labels is not None:
+            return self.evaluation_labels
+        return self.labels
 
     def _pwmse_score(self, targets: np.ndarray, preds: np.ndarray) -> float:
-        labels = np.asarray(self.labels, dtype=np.float64)
+        labels = np.asarray(self._get_scoring_labels(), dtype=np.float64)
         diffs_sqd = (labels[None, :] - targets[:, None]) ** 2
         return float((diffs_sqd * preds).sum(axis=1).mean())
 
+    def _filter_targets_for_evaluation(self, df: IntoFrameT) -> IntoFrameT:
+        if self.evaluation_labels is None:
+            return df
+        eval_set = set(self.evaluation_labels)
+        min_eval, max_eval = min(eval_set), max(eval_set)
+        target_col = nw.col(self.target)
+        return df.filter((target_col >= min_eval) & (target_col <= max_eval))
+
     @narwhals.narwhalify
     def score(self, df: IntoFrameT) -> float | dict[tuple, float]:
         df = apply_filters(df, self.filters)
@@ -386,6 +414,9 @@ class PWMSE(BaseScorer):
                 after,
             )
 
+        # Filter targets outside evaluation_labels range
+        df = self._filter_targets_for_evaluation(df)
+
         if self.aggregation_level:
             first_pred = df[self.pred_column].to_list()[0] if len(df) > 0 else None
             if isinstance(first_pred, (list, np.ndarray)):
@@ -415,12 +446,13 @@ class PWMSE(BaseScorer):
 
             targets = gran_df[self.target].to_numpy().astype(np.float64)
             preds = np.asarray(gran_df[self.pred_column].to_list(), dtype=np.float64)
+            preds = self._slice_and_renormalize(preds)
             score = self._pwmse_score(targets, preds)
             if self.compare_to_naive:
                 naive_probs_list = _naive_probability_predictions_for_df(
                     gran_df,
                     self.target,
-                    list(self.labels) if self.labels else None,
+                    list(self._get_scoring_labels()) if self._get_scoring_labels() else None,
                     self.naive_granularity,
                 )
                 naive_preds = np.asarray(naive_probs_list, dtype=np.float64)
@@ -432,12 +464,13 @@ class PWMSE(BaseScorer):
 
         targets = df[self.target].to_numpy().astype(np.float64)
         preds = np.asarray(df[self.pred_column].to_list(), dtype=np.float64)
+        preds = self._slice_and_renormalize(preds)
        score = self._pwmse_score(targets, preds)
        if self.compare_to_naive:
            naive_probs_list = _naive_probability_predictions_for_df(
                df,
                self.target,
-                list(self.labels) if self.labels else None,
+                list(self._get_scoring_labels()) if self._get_scoring_labels() else None,
                self.naive_granularity,
            )
            naive_preds = np.asarray(naive_probs_list, dtype=np.float64)
@@ -1358,4 +1391,6 @@ class ThresholdEventScorer(BaseScorer):
                 df, self.outcome_column, labels, self.naive_granularity
             )
             naive_score = self._score_with_probabilities(df, naive_list)
+            if isinstance(score, dict) and isinstance(naive_score, dict):
+                return {k: naive_score[k] - score[k] for k in score.keys()}
             return float(naive_score - score)
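
For readers skimming the diff: a minimal usage sketch of the new evaluation_labels option. The import path is assumed from the package layout above, the data is illustrative, and the manual check mirrors the constructor arguments and verification used in the new tests further down.

import numpy as np
import polars as pl

from spforge.scorer import PWMSE  # import path assumed from the package layout above

# Illustrative data: each "pred" row is a probability distribution over labels [-2, -1, 0, 1, 2].
df = pl.DataFrame(
    {
        "pred": [[0.1, 0.2, 0.4, 0.2, 0.1], [0.05, 0.15, 0.5, 0.2, 0.1]],
        "target": [0, 1],
    }
)

scorer = PWMSE(
    pred_column="pred",
    target="target",
    labels=[-2, -1, 0, 1, 2],
    evaluation_labels=[-1, 0, 1],  # score only the inner labels
)
score = scorer.score(df)

# What the scorer now does internally, per the diff above: slice the prediction
# columns belonging to evaluation_labels, renormalize each row, then compute the
# probability-weighted squared error against the targets.
preds = np.array([[0.1, 0.2, 0.4, 0.2, 0.1], [0.05, 0.15, 0.5, 0.2, 0.1]])
sliced = preds[:, 1:4]  # columns for labels -1, 0, 1
renorm = sliced / sliced.sum(axis=1, keepdims=True)
labels = np.array([-1.0, 0.0, 1.0])
targets = np.array([0.0, 1.0])
expected = float((((labels[None, :] - targets[:, None]) ** 2) * renorm).sum(axis=1).mean())
assert abs(score - expected) < 1e-10

Rows whose target falls outside the evaluation_labels range are dropped before scoring, as _filter_targets_for_evaluation in the diff above shows.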
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: spforge
-Version: 0.8.2
+Version: 0.8.4
 Summary: A flexible framework for generating features, ratings, and building machine learning or other models for training and inference on sports data.
 Author-email: Mathias Holmstrøm <mathiasholmstom@gmail.com>
 License: See LICENSE file
@@ -61,7 +61,7 @@ spforge/ratings/team_performance_predictor.py,sha256=ThQOmYQUqKBB46ONYHOMM2arXFH
 spforge/ratings/team_start_rating_generator.py,sha256=ZJe84sTvE4Yep3d4wKJMMJn2Q4PhcCwkO7Wyd5nsYUA,5110
 spforge/ratings/utils.py,sha256=qms5J5SD-FyXDR2G8giDMbu_AoLgI135pjW4nghxROg,3940
 spforge/scorer/__init__.py,sha256=wj8PCvYIl6742Xwmt86c3oy6iqE8Ss-OpwHud6kd9IY,256
-spforge/scorer/_score.py,sha256=pzI-upJU4bwm33J5CGhV8bY8HquudnS--0Z6bhD4xew,54498
+spforge/scorer/_score.py,sha256=TR0T9nJj0aeVgGfOE0fZmXlO66CELulYwxhi7ZAxhvY,56184
 spforge/transformers/__init__.py,sha256=IPCsMcsgBqG52d0ttATLCY4HvFCQZddExlLt74U-zuI,390
 spforge/transformers/_base.py,sha256=-smr_McQF9bYxM5-Agx6h7Xv_fhZzPfpAdQV-qK18bs,1134
 spforge/transformers/_net_over_predicted.py,sha256=5dC8pvA1DNO0yXPSgJSMGU8zAHi-maUELm7FqFQVo-U,2321
@@ -70,7 +70,7 @@ spforge/transformers/_other_transformer.py,sha256=xLfaFIhkFsigAoitB4x3F8An2j9ymd
 spforge/transformers/_predictor.py,sha256=2sE6gfVrilXzPVcBurSrtqHw33v2ljygQcEYXt9LhZc,3119
 spforge/transformers/_simple_transformer.py,sha256=zGUFNQYMeoDSa2CoQejQNiNmKCBN5amWTvyOchiUHj0,5660
 spforge/transformers/_team_ratio_predictor.py,sha256=g8_bR53Yyv0iNCtol1O9bgJSeZcIco_AfbQuUxQJkeY,6884
-spforge-0.8.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+spforge-0.8.4.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
 tests/test_autopipeline.py,sha256=WXHeqBdjQD6xaXVkzvS8ocz0WVP9R7lN0PiHJ2iD8nA,16911
 tests/test_autopipeline_context.py,sha256=IuRUY4IA6uMObvbl2pXSaXO2_tl3qX6wEbTZY0dkTMI,1240
 tests/test_feature_generator_pipeline.py,sha256=CAgBknWqawqYi5_hxcPmpxrLVa5elMHVv1VrSVRKXEA,17705
@@ -93,7 +93,7 @@ tests/performance_transformers/test_performances_transformers.py,sha256=A-tGiCx7
 tests/ratings/test_player_rating_generator.py,sha256=3mjqlX159QqOlBoY3r_TFkvLwpE4zlLE0fiqpbfk3ps,58547
 tests/ratings/test_ratings_property.py,sha256=ckyfGILXa4tfQvsgyXEzBDNr2DUmHwFRV13N60w66iE,6561
 tests/ratings/test_team_rating_generator.py,sha256=cDnf1zHiYC7pkgydE3MYr8wSTJIq-bPfSqhIRI_4Tic,95357
-tests/scorer/test_score.py,sha256=5uVCZyEYsonrfDL5tY9sYSlyXIk3JJy5VPUP7zHpkqY,63163
+tests/scorer/test_score.py,sha256=KTrGJypQEpU8tmgJ6LU8wK1SRC3PLUXFzZIyiA-UY7U,71749
 tests/scorer/test_score_aggregation_granularity.py,sha256=h-hyFOLzwp-92hYVU7CwvlRJ8jhB4DzXCtqgI-zcoqM,13677
 tests/transformers/test_estimator_transformer_context.py,sha256=5GOHbuWCWBMFwwOTJOuD4oNDsv-qDR0OxNZYGGuMdag,1819
 tests/transformers/test_net_over_predicted.py,sha256=vh7O1iRRPf4vcW9aLhOMAOyatfM5ZnLsQBKNAYsR3SU,3363
@@ -101,7 +101,7 @@ tests/transformers/test_other_transformer.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 tests/transformers/test_predictor_transformer.py,sha256=N1aBYLjN3ldpYZLwjih_gTFYSMitrZu-PNK78W6RHaQ,6877
 tests/transformers/test_simple_transformer.py,sha256=wWR0qjLb_uS4HXrJgGdiqugOY1X7kwd1_OPS02IT2b8,4676
 tests/transformers/test_team_ratio_predictor.py,sha256=fOUP_JvNJi-3kom3ZOs1EdG0I6Z8hpLpYKNHu1eWtOw,8562
-spforge-0.8.2.dist-info/METADATA,sha256=xcw8LWeJSYUBQ01Owe9FiI8fNmJVrlRRb2lnBcXSOmo,20219
-spforge-0.8.2.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
-spforge-0.8.2.dist-info/top_level.txt,sha256=6UW2M5a7WKOeaAi900qQmRKNj5-HZzE8-eUD9Y9LTq0,23
-spforge-0.8.2.dist-info/RECORD,,
+spforge-0.8.4.dist-info/METADATA,sha256=XNaD0lL_puuuYmZU59VjenOYpLSRCSx_nswef8yCZ4M,20219
+spforge-0.8.4.dist-info/WHEEL,sha256=qELbo2s1Yzl39ZmrAibXA2jjPLUYfnVhUNTlyF1rq0Y,92
+spforge-0.8.4.dist-info/top_level.txt,sha256=6UW2M5a7WKOeaAi900qQmRKNj5-HZzE8-eUD9Y9LTq0,23
+spforge-0.8.4.dist-info/RECORD,,

tests/scorer/test_score.py CHANGED
@@ -372,6 +372,136 @@ def test_pwmse_compare_to_naive_granularity(df_type):
     assert abs(score - expected) < 1e-10
 
 
+@pytest.mark.parametrize("df_type", [pl.DataFrame, pd.DataFrame])
+def test_pwmse__evaluation_labels_slices_predictions(df_type):
+    """PWMSE with evaluation_labels should only score on specified labels."""
+    # Predictions have 5 labels: [-2, -1, 0, 1, 2]
+    # But we only want to evaluate on inner labels: [-1, 0, 1]
+    df = create_dataframe(
+        df_type,
+        {
+            "pred": [
+                [0.1, 0.2, 0.4, 0.2, 0.1],  # Full distribution over 5 labels
+                [0.05, 0.15, 0.5, 0.2, 0.1],
+            ],
+            "target": [0, 1],
+        },
+    )
+
+    # Score with all labels
+    scorer_full = PWMSE(pred_column="pred", target="target", labels=[-2, -1, 0, 1, 2])
+    score_full = scorer_full.score(df)
+
+    # Score with evaluation_labels excluding boundaries
+    scorer_eval = PWMSE(
+        pred_column="pred",
+        target="target",
+        labels=[-2, -1, 0, 1, 2],
+        evaluation_labels=[-1, 0, 1],
+    )
+    score_eval = scorer_eval.score(df)
+
+    # Scores should be different because evaluation_labels excludes boundary penalties
+    assert score_full != score_eval
+
+    # Manual calculation for evaluation_labels case:
+    # Slice predictions to indices 1, 2, 3 (corresponding to labels -1, 0, 1)
+    # Then renormalize
+    preds_full = np.array([[0.1, 0.2, 0.4, 0.2, 0.1], [0.05, 0.15, 0.5, 0.2, 0.1]])
+    preds_sliced = preds_full[:, 1:4]  # [-1, 0, 1]
+    preds_renorm = preds_sliced / preds_sliced.sum(axis=1, keepdims=True)
+
+    eval_labels = np.array([-1, 0, 1], dtype=np.float64)
+    targets = np.array([0, 1], dtype=np.float64)
+    diffs_sqd = (eval_labels[None, :] - targets[:, None]) ** 2
+    expected = float((diffs_sqd * preds_renorm).sum(axis=1).mean())
+
+    assert abs(score_eval - expected) < 1e-10
+
+
+@pytest.mark.parametrize("df_type", [pl.DataFrame, pd.DataFrame])
+def test_pwmse__evaluation_labels_with_compare_to_naive(df_type):
+    """PWMSE evaluation_labels should also affect naive baseline calculation."""
+    df = create_dataframe(
+        df_type,
+        {
+            "pred": [
+                [0.1, 0.2, 0.4, 0.2, 0.1],
+                [0.1, 0.2, 0.4, 0.2, 0.1],
+                [0.1, 0.2, 0.4, 0.2, 0.1],
+                [0.1, 0.2, 0.4, 0.2, 0.1],
+            ],
+            "target": [-1, 0, 0, 1],  # Targets within evaluation range
+        },
+    )
+
+    scorer = PWMSE(
+        pred_column="pred",
+        target="target",
+        labels=[-2, -1, 0, 1, 2],
+        evaluation_labels=[-1, 0, 1],
+        compare_to_naive=True,
+    )
+    score = scorer.score(df)
+
+    # Naive should be computed using only evaluation_labels
+    # With targets [-1, 0, 0, 1], naive probs are [1/4, 2/4, 1/4] for labels [-1, 0, 1]
+    eval_labels = np.array([-1, 0, 1], dtype=np.float64)
+    targets = np.array([-1, 0, 0, 1], dtype=np.float64)
+
+    # Model predictions sliced and renormalized
+    preds_full = np.array([[0.1, 0.2, 0.4, 0.2, 0.1]] * 4)
+    preds_sliced = preds_full[:, 1:4]
+    preds_renorm = preds_sliced / preds_sliced.sum(axis=1, keepdims=True)
+
+    diffs_sqd = (eval_labels[None, :] - targets[:, None]) ** 2
+    model_score = float((diffs_sqd * preds_renorm).sum(axis=1).mean())
+
+    # Naive predictions for evaluation_labels only
+    naive_probs = np.array([0.25, 0.5, 0.25])  # Based on target distribution
+    naive_preds = np.tile(naive_probs, (4, 1))
+    naive_score = float((diffs_sqd * naive_preds).sum(axis=1).mean())
+
+    expected = naive_score - model_score
+    assert abs(score - expected) < 1e-10
+
+
+@pytest.mark.parametrize("df_type", [pl.DataFrame, pd.DataFrame])
+def test_pwmse__evaluation_labels_filters_targets_outside_range(df_type):
+    """PWMSE should filter out targets outside evaluation_labels range."""
+    df = create_dataframe(
+        df_type,
+        {
+            "pred": [
+                [0.1, 0.2, 0.4, 0.2, 0.1],
+                [0.1, 0.2, 0.4, 0.2, 0.1],
+                [0.1, 0.2, 0.4, 0.2, 0.1],
+            ],
+            "target": [-2, 0, 2],  # -2 and 2 are outside evaluation range [-1, 0, 1]
+        },
+    )
+
+    scorer = PWMSE(
+        pred_column="pred",
+        target="target",
+        labels=[-2, -1, 0, 1, 2],
+        evaluation_labels=[-1, 0, 1],
+    )
+    score = scorer.score(df)
+
+    # Should only use the row with target=0
+    preds_full = np.array([[0.1, 0.2, 0.4, 0.2, 0.1]])
+    preds_sliced = preds_full[:, 1:4]
+    preds_renorm = preds_sliced / preds_sliced.sum(axis=1, keepdims=True)
+
+    eval_labels = np.array([-1, 0, 1], dtype=np.float64)
+    targets = np.array([0], dtype=np.float64)
+    diffs_sqd = (eval_labels[None, :] - targets[:, None]) ** 2
+    expected = float((diffs_sqd * preds_renorm).sum(axis=1).mean())
+
+    assert abs(score - expected) < 1e-10
+
+
 # ============================================================================
 # D. MeanBiasScorer Tests
 # ============================================================================
@@ -1762,6 +1892,129 @@ def test_pwmse__accepts_ndarray_predictions(df_type):
     assert score >= 0
 
 
+# ============================================================================
+# ThresholdEventScorer with granularity and compare_to_naive Tests
+# ============================================================================
+
+
+@pytest.mark.parametrize("df_type", [pl.DataFrame, pd.DataFrame])
+def test_threshold_event_scorer__granularity_with_compare_to_naive(df_type):
+    """ThresholdEventScorer fails when combining compare_to_naive with granularity.
+
+    Bug: When granularity is set, binary_scorer.score() returns a dict, but
+    the naive comparison tries to do dict - dict which fails with:
+    'unsupported operand type(s) for -: 'dict' and 'dict''
+    """
+    df = create_dataframe(
+        df_type,
+        {
+            "qtr": [1, 1, 1, 2, 2, 2],
+            "dist": [
+                [0.1, 0.2, 0.3, 0.4],
+                [0.2, 0.3, 0.3, 0.2],
+                [0.3, 0.4, 0.2, 0.1],
+                [0.4, 0.3, 0.2, 0.1],
+                [0.1, 0.1, 0.4, 0.4],
+                [0.2, 0.2, 0.3, 0.3],
+            ],
+            "ydstogo": [2.0, 3.0, 1.0, 2.0, 1.0, 3.0],
+            "rush_yards": [3, 2, 0, 1, 2, 4],
+        },
+    )
+
+    scorer = ThresholdEventScorer(
+        dist_column="dist",
+        threshold_column="ydstogo",
+        outcome_column="rush_yards",
+        labels=[0, 1, 2, 3],
+        compare_to_naive=True,
+        granularity=["qtr"],
+    )
+
+    result = scorer.score(df)
+
+    assert isinstance(result, dict)
+    assert len(result) == 2
+    assert (1,) in result
+    assert (2,) in result
+    assert all(isinstance(v, float) for v in result.values())
+
+
+@pytest.mark.parametrize("df_type", [pl.DataFrame, pd.DataFrame])
+def test_threshold_event_scorer__granularity_with_compare_to_naive_and_naive_granularity(df_type):
+    """ThresholdEventScorer with both granularity and naive_granularity."""
+    df = create_dataframe(
+        df_type,
+        {
+            "qtr": [1, 1, 1, 2, 2, 2],
+            "team": ["A", "A", "B", "A", "B", "B"],
+            "dist": [
+                [0.1, 0.2, 0.3, 0.4],
+                [0.2, 0.3, 0.3, 0.2],
+                [0.3, 0.4, 0.2, 0.1],
+                [0.4, 0.3, 0.2, 0.1],
+                [0.1, 0.1, 0.4, 0.4],
+                [0.2, 0.2, 0.3, 0.3],
+            ],
+            "ydstogo": [2.0, 3.0, 1.0, 2.0, 1.0, 3.0],
+            "rush_yards": [3, 2, 0, 1, 2, 4],
+        },
+    )
+
+    scorer = ThresholdEventScorer(
+        dist_column="dist",
+        threshold_column="ydstogo",
+        outcome_column="rush_yards",
+        labels=[0, 1, 2, 3],
+        compare_to_naive=True,
+        naive_granularity=["team"],
+        granularity=["qtr"],
+    )
+
+    result = scorer.score(df)
+
+    assert isinstance(result, dict)
+    assert len(result) == 2
+    assert (1,) in result
+    assert (2,) in result
+    assert all(isinstance(v, float) for v in result.values())
+
+
+@pytest.mark.parametrize("df_type", [pl.DataFrame, pd.DataFrame])
+def test_threshold_event_scorer__multi_column_granularity_with_compare_to_naive(df_type):
+    """ThresholdEventScorer with multi-column granularity and compare_to_naive."""
+    df = create_dataframe(
+        df_type,
+        {
+            "qtr": [1, 1, 2, 2],
+            "half": [1, 1, 2, 2],
+            "dist": [
+                [0.1, 0.2, 0.3, 0.4],
+                [0.2, 0.3, 0.3, 0.2],
+                [0.4, 0.3, 0.2, 0.1],
+                [0.1, 0.1, 0.4, 0.4],
+            ],
+            "ydstogo": [2.0, 3.0, 2.0, 1.0],
+            "rush_yards": [3, 2, 1, 2],
+        },
+    )
+
+    scorer = ThresholdEventScorer(
+        dist_column="dist",
+        threshold_column="ydstogo",
+        outcome_column="rush_yards",
+        labels=[0, 1, 2, 3],
+        compare_to_naive=True,
+        granularity=["qtr", "half"],
+    )
+
+    result = scorer.score(df)
+
+    assert isinstance(result, dict)
+    assert len(result) == 2
+    assert all(isinstance(v, float) for v in result.values())
+
+
 @pytest.mark.parametrize("df_type", [pl.DataFrame, pd.DataFrame])
 def test_all_scorers_handle_all_nan_targets(df_type):
     """All scorers handle case where all targets are NaN"""