numerai-tools 0.4.2.dev1__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,40 @@
1
+ Metadata-Version: 2.3
2
+ Name: numerai-tools
3
+ Version: 0.5.0
4
+ Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
5
+ License: MIT
6
+ Author: Numerai Engineering
7
+ Author-email: engineering@numer.ai
8
+ Requires-Python: >=3.11
9
+ Classifier: Development Status :: 5 - Production/Stable
10
+ Classifier: Environment :: Console
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Topic :: Scientific/Engineering
17
+ Requires-Dist: numpy (>=2.0.0,<3.0.0)
18
+ Requires-Dist: pandas (>=2.2.2,<3.0.0)
19
+ Requires-Dist: scikit-learn (>=1.5.0,<2.0.0)
20
+ Requires-Dist: scipy (>=1.13.0,<2.0.0)
21
+ Project-URL: Documentation, https://docs.numer.ai/
22
+ Project-URL: Homepage, https://numer.ai
23
+ Project-URL: Repository, https://github.com/numerai/numerai-tools
24
+ Description-Content-Type: text/markdown
25
+
26
+ # numerai-tools
27
+ A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
28
+
29
+ ## Installation
30
+ ```
31
+ pip install numerai-tools
32
+ ```
33
+
34
+ ## Structure
35
+
36
+ - The `scoring.py` module contains critical functions used to score submissions. We use this code in our scoring system. Leverage this to optimize your models for the tournaments.
37
+
38
+ - The `submissions.py` module provides helper functions to ensure your submissions are valid and formatted correctly. Use this in your automated prediction pipelines to ensure uploads don't fail.
39
+
40
+ - The `signals.py` module provides code specific to Numerai Signals such as churn and turnover. Use this to ensure your Signals submissions are properly formatted.
@@ -0,0 +1,15 @@
1
+ # numerai-tools
2
+ A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
3
+
4
+ ## Installation
5
+ ```
6
+ pip install numerai-tools
7
+ ```
8
+
9
+ ## Structure
10
+
11
+ - The `scoring.py` module contains critical functions used to score submissions. We use this code in our scoring system. Leverage this to optimize your models for the tournaments.
12
+
13
+ - The `submissions.py` module provides helper functions to ensure your submissions are valid and formatted correctly. Use this in your automated prediction pipelines to ensure uploads don't fail.
14
+
15
+ - The `signals.py` module provides code specific to Numerai Signals such as churn and turnover. Use this to ensure your Signals submissions are properly formatted.
@@ -1,8 +1,8 @@
1
- from typing import List, Tuple, Union, Optional, TypeVar
1
+ from typing import List, Literal, Tuple, Union, Optional, TypeVar, cast, Any
2
2
 
3
3
  import numpy as np
4
- import pandas as pd # type: ignore
5
- from scipy import stats # type: ignore
4
+ import pandas as pd
5
+ from scipy import stats
6
6
  from sklearn.preprocessing import OneHotEncoder # type: ignore
7
7
 
8
8
 
@@ -14,6 +14,7 @@ DEFAULT_MAX_FILTERED_INDEX_RATIO = 0.2
14
14
 
15
15
  S1 = TypeVar("S1", bound=Union[pd.DataFrame, pd.Series])
16
16
  S2 = TypeVar("S2", bound=Union[pd.DataFrame, pd.Series])
17
+ RANK_METHOD_TYPE = Literal["average", "min", "max", "first", "dense"]
17
18
 
18
19
 
19
20
  def filter_sort_index(
@@ -43,12 +44,13 @@ def filter_sort_index(
43
44
  "s2 does not have enough overlapping ids with s1,"
44
45
  f" must have >= {round(1-max_filtered_ratio,2)*100}% overlapping ids"
45
46
  )
46
- return s1.loc[ids].sort_index(), s2.loc[ids].sort_index()
47
+ return cast(S1, s1.loc[ids].sort_index()), cast(S2, s2.loc[ids].sort_index())
47
48
 
48
49
 
49
50
  def filter_sort_index_many(
50
- inputs: List[S1], max_filtered_ratio: float = DEFAULT_MAX_FILTERED_INDEX_RATIO
51
- ) -> List[S1]:
51
+ inputs: List[Any],
52
+ max_filtered_ratio: float = DEFAULT_MAX_FILTERED_INDEX_RATIO,
53
+ ) -> List[Any]:
52
54
  """Filters the indices of the given list of series to match each other,
53
55
  then sorts the indices, then checks that we didn't filter too many indices
54
56
  before returning the filtered and sorted series.
@@ -74,43 +76,72 @@ def filter_sort_index_many(
74
76
 
75
77
 
76
78
  def filter_sort_top_bottom(
77
- s: pd.Series, top_bottom: int, return_concatenated: bool = True
78
- ) -> Union[pd.Series, Tuple[pd.Series, pd.Series]]:
79
+ s: pd.Series, top_bottom: int
80
+ ) -> Tuple[pd.Series, pd.Series]:
79
81
  """Filters the series according to the top n and bottom n values
80
- then sorts the index and returns the filtered and sorted series.
82
+ then sorts the index and returns two filtered and sorted series
83
+ for the top and bottom values respectively.
81
84
 
82
85
  Arguments:
83
86
  s: pd.Series - the data to filter and sort
84
87
  top_bottom: int - the number of top n and bottom n values to keep
85
88
 
86
89
  Returns:
87
- pd.Series - the filtered and sorted data
90
+ Tuple[pd.Series, pd.Series] - the filtered and sorted top and bottom series respectively
88
91
  """
89
92
  tb_idx = np.argsort(s, kind="stable")
90
93
  bot = s.iloc[tb_idx[:top_bottom]]
91
94
  top = s.iloc[tb_idx[-top_bottom:]]
92
- if return_concatenated:
93
- return pd.concat([top, bot]).sort_index()
94
- else:
95
- return top.sort_index(), bot.sort_index()
95
+ return top.sort_index(), bot.sort_index()
96
96
 
97
97
 
98
- def rank(df: pd.DataFrame, method: str = "average") -> pd.DataFrame:
99
- """Percentile rank each column of a pandas DataFrame, centering values around 0.5
98
+ def filter_sort_top_bottom_concat(s: pd.Series, top_bottom: int) -> pd.Series:
99
+ """Similar to filter_sort_top_bottom, but concatenates the top and bottom series
100
+ into 1 series and then sorts the index.
100
101
 
101
102
  Arguments:
102
- df: pd.DataFrame - the data to rank
103
+ s: pd.Series - the data to filter and sort
104
+ top_bottom: int - the number of top n and bottom n values to keep
105
+
106
+ Returns:
107
+ pd.Series - the concatenated and sorted series of top and bottom values
108
+ """
109
+ top, bot = filter_sort_top_bottom(s, top_bottom)
110
+ return pd.concat([top, bot]).sort_index()
111
+
112
+
113
+ def rank_series(s: pd.Series, method: RANK_METHOD_TYPE = "average") -> pd.Series:
114
+ """Percentile rank a pandas Series, centering values around 0.5.
115
+
116
+ Arguments:
117
+ s: pd.Series - the data to rank
103
118
  method: str - the pandas ranking method to use, options:
104
119
  'average' (default) - keeps ties
105
120
  'first' - breaks ties by index
106
121
 
107
122
  Returns:
108
- pd.DataFrame - the ranked DataFrame
123
+ pd.Series - the ranked Series
109
124
  """
110
- assert np.array_equal(df.index.sort_values(), df.index), "unsorted index found"
111
- return df.apply(
112
- lambda series: (series.rank(method=method).values - 0.5) / series.count()
113
- )
125
+ assert np.array_equal(s.index.sort_values(), s.index), "unsorted index found"
126
+ return (s.rank(method=method) - 0.5) / s.count()
127
+
128
+
129
+ def rank(s: S1, method: RANK_METHOD_TYPE = "average") -> S1:
130
+ """Percentile rank each columns or series, centering values around 0.5
131
+
132
+ Arguments:
133
+ s: pd.DataFrame | pd.Series - the data to rank
134
+ method: str - the pandas ranking method to use, options:
135
+ 'average' (default) - keeps ties
136
+ 'first' - breaks ties by index
137
+
138
+ Returns:
139
+ pd.DataFrame | pd.Series - the ranked input data
140
+ """
141
+ if isinstance(s, pd.Series):
142
+ return cast(S1, rank_series(s, method))
143
+ else:
144
+ return s.apply(lambda series: rank(series, method=method))
114
145
 
115
146
 
116
147
  def tie_broken_rank(df: pd.DataFrame) -> pd.DataFrame:
@@ -118,9 +149,9 @@ def tie_broken_rank(df: pd.DataFrame) -> pd.DataFrame:
118
149
  return rank(df, "first")
119
150
 
120
151
 
121
- def tie_kept_rank(df: pd.DataFrame) -> pd.DataFrame:
152
+ def tie_kept_rank(s: S1) -> S1:
122
153
  """Rank columns, but keep ties."""
123
- return rank(df, "average")
154
+ return cast(S1, rank(s, "average"))
124
155
 
125
156
 
126
157
  def min_max_normalize(s: pd.Series) -> pd.Series:
@@ -133,14 +164,14 @@ def variance_normalize(df: pd.DataFrame) -> pd.DataFrame:
133
164
  return df / np.std(df, axis=0)
134
165
 
135
166
 
136
- def weight_normalize(df: pd.DataFrame) -> pd.DataFrame:
137
- """Scale a df such that all columns have absolute value sum == 1."""
138
- return df / df.abs().sum(axis=0)
167
+ def weight_normalize(s: S1) -> S1:
168
+ """Scale a input such that all columns have absolute value sum == 1."""
169
+ return cast(S1, s / s.abs().sum(axis=0))
139
170
 
140
171
 
141
- def center(df: pd.DataFrame) -> pd.DataFrame:
142
- """Shift the df such that all columns have mean == 0."""
143
- return df - df.mean()
172
+ def center(s: S1) -> S1:
173
+ """Shift the input such that all columns have mean == 0."""
174
+ return cast(S1, s - s.mean())
144
175
 
145
176
 
146
177
  def standardize(df: pd.DataFrame) -> pd.DataFrame:
@@ -179,7 +210,7 @@ def pearson_correlation(
179
210
  target: pd.Series, predictions: pd.Series, top_bottom: Optional[int] = None
180
211
  ) -> float:
181
212
  if top_bottom is not None and top_bottom > 0:
182
- predictions = filter_sort_top_bottom(predictions, top_bottom)
213
+ predictions = filter_sort_top_bottom_concat(predictions, top_bottom)
183
214
  target, predictions = filter_sort_index(
184
215
  target, predictions, (1 - top_bottom / len(target))
185
216
  )
@@ -205,7 +236,7 @@ def power(df: pd.DataFrame, p: float) -> pd.DataFrame:
205
236
  """
206
237
  assert not df.isna().any().any(), "Data contains NaNs"
207
238
  assert np.array_equal(df.index.sort_values(), df.index), "Index is not sorted"
208
- result = np.sign(df) * np.abs(df) ** p
239
+ result = cast(pd.DataFrame, np.sign(df) * np.abs(df) ** p)
209
240
  assert ((result.std() == 0) | (result.corrwith(df) >= 0.9)).all()
210
241
  return result
211
242
 
@@ -221,7 +252,7 @@ def gaussian(df: pd.DataFrame) -> pd.DataFrame:
221
252
  pd.DataFrame - the gaussianized data
222
253
  """
223
254
  assert np.array_equal(df.index.sort_values(), df.index)
224
- return df.apply(lambda series: stats.norm.ppf(series))
255
+ return df.apply(lambda series: cast(np.ndarray, stats.norm.ppf(series)))
225
256
 
226
257
 
227
258
  def orthogonalize(v: np.ndarray, u: np.ndarray) -> np.ndarray:
@@ -303,7 +334,7 @@ def correlation_contribution(
303
334
  m = gaussian(tie_kept_rank(meta_model.to_frame()))[meta_model.name].values
304
335
 
305
336
  # orthogonalize predictions wrt meta model
306
- neutral_preds = orthogonalize(p, m)
337
+ neutral_preds = orthogonalize(p, cast(np.ndarray, m))
307
338
 
308
339
  # convert target to buckets [-2, -1, 0, 1, 2]
309
340
  if (live_targets >= 0).all() and (live_targets <= 1).all():
@@ -314,9 +345,9 @@ def correlation_contribution(
314
345
  # filter each column to its top and bottom n predictions
315
346
  neutral_preds_df = pd.DataFrame(
316
347
  neutral_preds, columns=predictions.columns, index=predictions.index
317
- ).apply(lambda p: filter_sort_top_bottom(p, top_bottom))
318
- # create a dataframe for targets to match the filtered predictions
319
- live_targets = (
348
+ ).apply(lambda p: filter_sort_top_bottom_concat(p, top_bottom))
349
+ mmc_matrix = (
350
+ # create a dataframe for targets to match the filtered predictions
320
351
  neutral_preds_df.apply(
321
352
  lambda p: filter_sort_index(
322
353
  p,
@@ -326,19 +357,15 @@ def correlation_contribution(
326
357
  )
327
358
  .fillna(0)
328
359
  .T.values
329
- )
330
- # fillna with 0 so we don't get NaNs in the dot product
331
- neutral_preds = neutral_preds_df.fillna(0).values
332
-
333
- # multiply target and neutralized predictions
334
- # this is equivalent to covariance b/c mean = 0
335
- mmc = live_targets @ neutral_preds
336
- if top_bottom is not None and top_bottom > 0:
360
+ # then fill NaNs with 0 so we don't get NaNs in the dot product
361
+ # and multiply target w/ neutral preds to get MMC
362
+ ) @ neutral_preds_df.fillna(0).values
337
363
  # only the diagonal is the proper score
338
- mmc = np.diag(mmc) / (top_bottom * 2)
364
+ mmc = np.diag(mmc_matrix) / (top_bottom * 2)
339
365
  else:
340
- mmc /= len(live_targets)
341
-
366
+ # multiply target and neutralized predictions
367
+ # this is equivalent to covariance b/c mean = 0
368
+ mmc = (live_targets @ neutral_preds) / len(live_targets)
342
369
  return pd.Series(mmc, index=predictions.columns)
343
370
 
344
371
 
@@ -461,7 +488,7 @@ def numerai_corr(
461
488
  Returns:
462
489
  pd.Series - the resulting correlation scores for each column in predictions
463
490
  """
464
- targets = targets - targets.mean()
491
+ targets = center(targets)
465
492
  targets, predictions = filter_sort_index(
466
493
  targets, predictions, max_filtered_index_ratio
467
494
  )
@@ -522,21 +549,33 @@ def max_feature_correlation(
522
549
  feature_correlations = features.apply(
523
550
  lambda f: pearson_correlation(f, s, top_bottom)
524
551
  )
525
- feature_correlations = np.abs(feature_correlations)
552
+ feature_correlations = feature_correlations.abs()
526
553
  max_feature = feature_correlations.idxmax()
527
554
  max_corr = feature_correlations[max_feature]
528
- return max_feature, max_corr
555
+ return str(max_feature), max_corr
529
556
 
530
557
 
531
558
  def generate_neutralized_weights(
532
- predictions: pd.Series,
559
+ predictions: pd.DataFrame,
533
560
  neutralizers: pd.DataFrame,
534
561
  sample_weights: pd.Series,
535
- ) -> pd.Series:
536
- neutral_preds = predictions - (
537
- neutralizers @ (neutralizers.T @ (sample_weights * predictions))
562
+ center_and_normalize: bool = False,
563
+ ) -> pd.DataFrame:
564
+ assert not predictions.isna().any().any(), "Predictions contain NaNs"
565
+ assert not neutralizers.isna().any().any(), "Normalization factors contain NaNs"
566
+ assert not sample_weights.isna().any(), "Weights contain NaNs"
567
+ ranked_predictions = tie_kept_rank__gaussianize__pow_1_5(predictions)
568
+ ranked_predictions, neutralizers, sample_weights = filter_sort_index_many(
569
+ [ranked_predictions, neutralizers, sample_weights]
570
+ )
571
+ neutral_weights = ranked_predictions.apply(
572
+ lambda s_prime: (
573
+ s_prime - neutralizers @ (neutralizers.T @ (sample_weights * s_prime))
574
+ )
575
+ * sample_weights
538
576
  )
539
- neutral_weights = neutral_preds * sample_weights
577
+ if center_and_normalize:
578
+ neutral_weights = weight_normalize(center(neutral_weights))
540
579
  return neutral_weights
541
580
 
542
581
 
@@ -557,18 +596,9 @@ def alpha(
557
596
  sample_weights: pd.Series - the universe sampling weights
558
597
  targets: pd.Series - the live targets to evaluate against
559
598
  """
560
- assert not predictions.isna().any().any(), "Predictions contain NaNs"
561
- assert not neutralizers.isna().any().any(), "Normalization factors contain NaNs"
562
- assert not sample_weights.isna().any(), "Weights contain NaNs"
563
- predictions, neutralizers, sample_weights, targets = filter_sort_index_many(
564
- [predictions, neutralizers, sample_weights, targets]
565
- )
566
-
567
- weights = tie_kept_rank__gaussianize__pow_1_5(predictions).apply(
568
- lambda s_prime: generate_neutralized_weights(
569
- s_prime, neutralizers, sample_weights
570
- )
571
- )
599
+ targets = center(targets)
600
+ predictions, targets = filter_sort_index(predictions, targets)
601
+ weights = generate_neutralized_weights(predictions, neutralizers, sample_weights)
572
602
  alpha_scores = weights.apply(lambda w: w @ targets) / len(targets)
573
603
  return alpha_scores
574
604
 
@@ -593,33 +623,22 @@ def meta_portfolio_contribution(
593
623
  sample_weights: pd.Series - the universe sampling weights
594
624
  targets: pd.Series - the live targets to evaluate against
595
625
  """
596
- assert not predictions.isna().any().any(), "Predictions contain NaNs"
597
- assert not neutralizers.isna().any().any(), "Normalization factors contain NaNs"
598
- assert not sample_weights.isna().any(), "Weights contain NaNs"
599
- predictions, neutralizers, sample_weights, targets = filter_sort_index_many(
600
- [predictions, neutralizers, sample_weights, targets]
601
- )
626
+ targets = center(targets)
627
+ predictions, targets = filter_sort_index(predictions, targets)
602
628
  stake_weights = weight_normalize(stakes.fillna(0))
603
629
  assert np.isclose(stake_weights.sum(), 1), "Stakes must sum to 1"
604
- weights = tie_kept_rank__gaussianize__pow_1_5(predictions).apply(
605
- lambda s_prime: generate_neutralized_weights(
606
- s_prime, neutralizers, sample_weights
607
- )
608
- )
609
- w = weights[stakes.index].values
610
- s = stake_weights.values
611
- t = targets.values
630
+ weights = generate_neutralized_weights(predictions, neutralizers, sample_weights)
631
+ w = cast(np.ndarray, weights[stakes.index].values)
632
+ s = cast(np.ndarray, stake_weights.values)
633
+ t = cast(np.ndarray, targets.values)
612
634
  swp = w @ s
613
635
  swp = swp - swp.mean()
614
- swp_abs_sum = np.sum(np.abs(swp))
636
+ l1_norm = np.sum(np.abs(swp))
637
+ l1_norm_squared = np.power(l1_norm, 2)
615
638
  swp_sign = np.sign(swp)
616
- alpha_unnormalized_swp_grad = (
617
- 1
618
- / np.power(swp_abs_sum, 2)
619
- * (swp_abs_sum * t - swp_sign * np.dot(swp, t)).reshape(-1, 1)
620
- )
621
- zero_mean_jac_vec_prod = (
622
- alpha_unnormalized_swp_grad - alpha_unnormalized_swp_grad.mean()
623
- )
624
- mpc = (w.T @ zero_mean_jac_vec_prod).squeeze()
639
+ swp_alpha = np.dot(swp, t)
640
+ directional_gradient = l1_norm * t - swp_sign * swp_alpha
641
+ jacobian_vector_product = directional_gradient.reshape(-1, 1) / l1_norm_squared
642
+ centered_jacobian = jacobian_vector_product - jacobian_vector_product.mean()
643
+ mpc = (w.T @ centered_jacobian).squeeze()
625
644
  return pd.Series(mpc, index=stakes.index)
@@ -0,0 +1,206 @@
1
+ from typing import Tuple, Optional
2
+
3
+ from numerai_tools.scoring import (
4
+ filter_sort_index,
5
+ filter_sort_top_bottom,
6
+ spearman_correlation,
7
+ generate_neutralized_weights,
8
+ )
9
+ from numerai_tools.submissions import (
10
+ validate_submission_signals,
11
+ clean_submission,
12
+ )
13
+
14
+ import pandas as pd
15
+
16
+
17
def churn(
    s1: pd.Series,
    s2: pd.Series,
    top_bottom: Optional[int] = None,
) -> float:
    """Measure how much two series disagree with each other.

    Two modes are supported:

    - Without ``top_bottom`` (``None`` or not positive), both series are
      aligned with ``filter_sort_index`` and the result is
      1 - Spearman correlation of the aligned series.
    - With a positive ``top_bottom``, the top-n and bottom-n entries of each
      series are selected with ``filter_sort_top_bottom``. The result is
      1 minus the average of the fraction of ids shared by the two top
      selections and the fraction shared by the two bottom selections.
      This mode is only relevant when the series are rank signals and not
      portfolio weights.

    Arguments:
        s1: pd.Series - the first series to compare
        s2: pd.Series - the second series to compare
        top_bottom: Optional[int] - the number of top and bottom predictions
                                    to compare, i.e. 2*top_bottom predictions
                                    per series.

    Returns:
        float - the churn between the two series

    Raises:
        AssertionError - in the correlation mode, when either aligned series
                         has zero standard deviation.
    """
    if not (top_bottom is not None and top_bottom > 0):
        # Correlation mode: compare only the ids kept by filter_sort_index.
        aligned_1, aligned_2 = filter_sort_index(s1, s2)
        assert aligned_1.std() > 0, "s1 must have non-zero standard deviation"
        assert aligned_2.std() > 0, "s2 must have non-zero standard deviation"
        return 1 - spearman_correlation(aligned_1, aligned_2)

    # Overlap mode: count ids that remain in the same extreme bucket.
    top_1, bottom_1 = filter_sort_top_bottom(s1, top_bottom)
    top_2, bottom_2 = filter_sort_top_bottom(s2, top_bottom)
    shared_top = top_1.index.intersection(top_2.index)
    shared_bottom = bottom_1.index.intersection(bottom_2.index)
    top_overlap = len(shared_top) / top_bottom
    bot_overlap = len(shared_bottom) / top_bottom
    return 1 - (top_overlap + bot_overlap) / 2
52
+
53
+
54
def turnover(
    s1: pd.Series,
    s2: pd.Series,
):
    """Compute the turnover between two series of weights.

    The two series are first aligned with ``filter_sort_index``; turnover is
    then the sum of the absolute element-wise differences divided by 2.
    This is only relevant when the series are portfolio weights and not rank
    signals.

    Arguments:
        s1: pd.Series - the first series to compare
        s2: pd.Series - the second series to compare

    Returns:
        float - the turnover between the two series
    """
    aligned_1, aligned_2 = filter_sort_index(s1, s2)
    total_abs_change = (aligned_1 - aligned_2).abs().sum()
    # Halve the total absolute change, as the definition above requires.
    return total_abs_change / 2
75
+
76
+
77
def calculate_max_churn_and_turnover(
    curr_sub: pd.Series,
    curr_neutralizer: pd.DataFrame,
    curr_sample_weight: pd.Series,
    prev_subs: dict[str, pd.Series],
    prev_neutralizers: dict[str, pd.DataFrame],
    prev_sample_weights: dict[str, pd.Series],
) -> Tuple[float, float]:
    """Calculate the maximum churn and turnover of the current submission with
    respect to previous submissions.

    Each previous submission is validated and cleaned against the universe
    given by its sample weights, converted to neutralized weights, and then
    compared with the current submission: churn is computed on the cleaned
    submissions and turnover on the neutralized weights. It expects the
    following:

    - all submissions, neutralizers, and sample weights are indexed on the
      same type of tickers/IDs (e.g. all numerai_ticker or all
      composite_figi).

    - neutralizers and sample weights cover the full universe of their
      respective eras. This means you should avoid removing rows from
      neutralizers or sample weights before passing them to this function.

    In a live submission environment your submissions are joined on their
    respective full universes, ranked, and then any NaNs are filled with 0.5
    before calculating churn and turnover. So, if you provide filtered
    neutralizers or sample weights, your locally calculated churn and
    turnover may not match the live value.

    Arguments:
        curr_sub: pd.Series - current-era submission indexed on tickers/ids

        curr_neutralizer: pd.DataFrame
            - current-era neutralizers indexed on the same type of
              tickers/ids. We expect these to cover the full universe for
              the current era.

        curr_sample_weight: pd.Series
            - current-era sample weights indexed on the same type of
              tickers/ids. We expect these to cover the full universe for
              the current era.

        prev_subs: dict[str, pd.Series]
            - a dictionary mapping datestamps to submissions, where each
              submission is a Series indexed on the same type of tickers/ids
              as the current submission. To calculate churn and turnover for
              a live submission, use the most recent 5 submissions. For
              diagnostics, just provide the last 1 era.

        prev_neutralizers: dict[str, pd.DataFrame]
            - a dictionary mapping the same datestamps to neutralizers
              DataFrames indexed on the same type of tickers/ids as the
              current submission. We expect each of these to cover the full
              universe of their respective eras.

        prev_sample_weights: dict[str, pd.Series]
            - a dictionary mapping the same datestamps to sample weights
              Series indexed on the same type of tickers/ids as the current
              submission. We expect each of these to cover the full universe
              of their respective eras.

    Returns:
        prev_week_max_churn -- the maximum churn from previous submissions,
            or 1.0 when no previous submission could be compared
        prev_week_max_turnover -- the maximum turnover from previous
            submissions, or 1.0 when no previous submission could be compared

    Raises:
        KeyError - if a datestamp in prev_subs is missing from
            prev_neutralizers or prev_sample_weights.
        AssertionError - if comparing against a previous submission fails for
            any reason other than insufficient overlapping ids (those eras
            are skipped instead).
    """
    (
        curr_ticker_col,
        curr_signal_col,
        _,
        curr_sub_df,
        _,
    ) = validate_submission_signals(
        universe=curr_sample_weight.index.to_frame(),
        submission=curr_sub.reset_index(),
    )
    curr_sub = clean_submission(
        universe=curr_sample_weight.index.to_frame(),
        submission=curr_sub_df,
        src_id_col=curr_ticker_col,
        src_signal_col=curr_signal_col,
        rank_and_fill=True,
    )
    churn_stats = []
    turnover_stats = []
    neutralized_weights = generate_neutralized_weights(
        curr_sub.to_frame(),
        curr_neutralizer,
        curr_sample_weight,
        center_and_normalize=True,
    )[curr_sub.name]
    for datestamp, prev_sub in prev_subs.items():
        prev_neutralizer = prev_neutralizers[datestamp]
        prev_sample_weight = prev_sample_weights[datestamp]
        (
            prev_ticker_col,
            prev_signal_col,
            _,
            prev_sub_df,
            _,
        ) = validate_submission_signals(
            universe=prev_sample_weight.index.to_frame(),
            submission=prev_sub.reset_index(),
        )
        # Rename the previous submission's columns to the current ones so the
        # two cleaned series are directly comparable.
        prev_sub = clean_submission(
            universe=prev_sample_weight.index.to_frame(),
            submission=prev_sub_df,
            src_id_col=prev_ticker_col,
            src_signal_col=prev_signal_col,
            dst_id_col=curr_ticker_col,
            dst_signal_col=curr_signal_col,
            rank_and_fill=True,
        )
        prev_neutralized_weights = generate_neutralized_weights(
            prev_sub.to_frame(),
            prev_neutralizer,
            prev_sample_weight,
            center_and_normalize=True,
        )[prev_sub.name]
        try:
            churn_val = abs(churn(curr_sub, prev_sub))
            turnover_val = abs(turnover(neutralized_weights, prev_neutralized_weights))
        except AssertionError as e:
            # Only an insufficient-overlap failure means "this era is not
            # comparable"; any other assertion is a real problem and must not
            # be swallowed (it previously led to an unbound or stale value
            # being recorded below).
            if "does not have enough overlapping ids" not in str(e):
                raise
            continue

        churn_stats.append(churn_val)
        turnover_stats.append(turnover_val)
    # With nothing to compare against, report the fallback value of 1.0.
    prev_week_max_churn = max(churn_stats, default=1.0)
    prev_week_max_turnover = max(turnover_stats, default=1.0)
    return prev_week_max_churn, prev_week_max_turnover