numerai-tools 0.4.2.dev1__tar.gz → 0.5.0.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (20)
  1. {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/PKG-INFO +1 -1
  2. {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/numerai_tools/scoring.py +15 -15
  3. numerai_tools-0.5.0.dev0/numerai_tools/signals.py +215 -0
  4. {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/numerai_tools/submissions.py +23 -4
  5. {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/numerai_tools.egg-info/PKG-INFO +1 -1
  6. {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/setup.py +1 -1
  7. numerai_tools-0.5.0.dev0/tests/test_signals.py +139 -0
  8. {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/tests/test_submissions.py +12 -0
  9. numerai_tools-0.4.2.dev1/numerai_tools/signals.py +0 -72
  10. numerai_tools-0.4.2.dev1/tests/test_signals.py +0 -51
  11. {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/LICENSE +0 -0
  12. {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/README.md +0 -0
  13. {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/numerai_tools/__init__.py +0 -0
  14. {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/numerai_tools/py.typed +0 -0
  15. {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/numerai_tools.egg-info/SOURCES.txt +0 -0
  16. {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/numerai_tools.egg-info/dependency_links.txt +0 -0
  17. {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/numerai_tools.egg-info/requires.txt +0 -0
  18. {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/numerai_tools.egg-info/top_level.txt +0 -0
  19. {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/setup.cfg +0 -0
  20. {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/tests/test_scoring.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: numerai_tools
3
- Version: 0.4.2.dev1
3
+ Version: 0.5.0.dev0
4
4
  Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
5
5
  Home-page: https://github.com/numerai/numerai-tools
6
6
  Maintainer: Numerai
@@ -47,8 +47,9 @@ def filter_sort_index(
47
47
 
48
48
 
49
49
  def filter_sort_index_many(
50
- inputs: List[S1], max_filtered_ratio: float = DEFAULT_MAX_FILTERED_INDEX_RATIO
51
- ) -> List[S1]:
50
+ inputs: List[pd.DataFrame],
51
+ max_filtered_ratio: float = DEFAULT_MAX_FILTERED_INDEX_RATIO,
52
+ ) -> List[pd.DataFrame]:
52
53
  """Filters the indices of the given list of series to match each other,
53
54
  then sorts the indices, then checks that we didn't filter too many indices
54
55
  before returning the filtered and sorted series.
@@ -461,7 +462,7 @@ def numerai_corr(
461
462
  Returns:
462
463
  pd.Series - the resulting correlation scores for each column in predictions
463
464
  """
464
- targets = targets - targets.mean()
465
+ targets = center(targets)
465
466
  targets, predictions = filter_sort_index(
466
467
  targets, predictions, max_filtered_index_ratio
467
468
  )
@@ -557,14 +558,15 @@ def alpha(
557
558
  sample_weights: pd.Series - the universe sampling weights
558
559
  targets: pd.Series - the live targets to evaluate against
559
560
  """
561
+ targets = center(targets)
560
562
  assert not predictions.isna().any().any(), "Predictions contain NaNs"
561
563
  assert not neutralizers.isna().any().any(), "Normalization factors contain NaNs"
562
564
  assert not sample_weights.isna().any(), "Weights contain NaNs"
563
565
  predictions, neutralizers, sample_weights, targets = filter_sort_index_many(
564
566
  [predictions, neutralizers, sample_weights, targets]
565
567
  )
566
-
567
- weights = tie_kept_rank__gaussianize__pow_1_5(predictions).apply(
568
+ ranked_preds = tie_kept_rank__gaussianize__pow_1_5(predictions)
569
+ weights = ranked_preds.apply(
568
570
  lambda s_prime: generate_neutralized_weights(
569
571
  s_prime, neutralizers, sample_weights
570
572
  )
@@ -593,6 +595,7 @@ def meta_portfolio_contribution(
593
595
  sample_weights: pd.Series - the universe sampling weights
594
596
  targets: pd.Series - the live targets to evaluate against
595
597
  """
598
+ targets = center(targets)
596
599
  assert not predictions.isna().any().any(), "Predictions contain NaNs"
597
600
  assert not neutralizers.isna().any().any(), "Normalization factors contain NaNs"
598
601
  assert not sample_weights.isna().any(), "Weights contain NaNs"
@@ -611,15 +614,12 @@ def meta_portfolio_contribution(
611
614
  t = targets.values
612
615
  swp = w @ s
613
616
  swp = swp - swp.mean()
614
- swp_abs_sum = np.sum(np.abs(swp))
617
+ l1_norm = np.sum(np.abs(swp))
618
+ l1_norm_squared = np.power(l1_norm, 2)
615
619
  swp_sign = np.sign(swp)
616
- alpha_unnormalized_swp_grad = (
617
- 1
618
- / np.power(swp_abs_sum, 2)
619
- * (swp_abs_sum * t - swp_sign * np.dot(swp, t)).reshape(-1, 1)
620
- )
621
- zero_mean_jac_vec_prod = (
622
- alpha_unnormalized_swp_grad - alpha_unnormalized_swp_grad.mean()
623
- )
624
- mpc = (w.T @ zero_mean_jac_vec_prod).squeeze()
620
+ swp_alpha = np.dot(swp, t)
621
+ directional_gradient = l1_norm * t - swp_sign * swp_alpha
622
+ jacobian_vector_product = directional_gradient.reshape(-1, 1) / l1_norm_squared
623
+ centered_jacobian = jacobian_vector_product - jacobian_vector_product.mean()
624
+ mpc = (w.T @ centered_jacobian).squeeze()
625
625
  return pd.Series(mpc, index=stakes.index)
@@ -0,0 +1,215 @@
1
+ from typing import Tuple, Optional
2
+
3
+ from numerai_tools.submissions import validate_headers_signals, validate_ids_signals
4
+ from numerai_tools.scoring import (
5
+ filter_sort_index,
6
+ filter_sort_top_bottom,
7
+ spearman_correlation,
8
+ tie_kept_rank,
9
+ tie_kept_rank__gaussianize__pow_1_5,
10
+ filter_sort_index_many,
11
+ generate_neutralized_weights,
12
+ weight_normalize,
13
+ center,
14
+ )
15
+
16
+ import pandas as pd
17
+
18
+
19
+ def churn(
20
+ s1: pd.Series,
21
+ s2: pd.Series,
22
+ top_bottom: Optional[int] = None,
23
+ ) -> float:
24
+ """Calculate the churn between two series. Churn is the proportion of elements
25
+ that are different between the two series.
26
+
27
+ For 2 given series with overlapping indices, churn is 1 - Spearman Correlation.
28
+ If top_bottom is provided, the churn is calculated as the average of the % of
29
+ tickers that stay in the top and bottom predictions. This is only relevant when
30
+ the series are rank signals and not portfolio weights.
31
+
32
+ Arguments:
33
+ s1: pd.Series - the first series to compare
34
+ s2: pd.Series - the second series to compare
35
+ top_bottom: Optional[int] - the number of top and bottom predictions to use
36
+ when calculating the correlation. Results in
37
+ 2*top_bottom predictions.
38
+
39
+ Returns:
40
+ float - the churn between the two series
41
+ """
42
+ if top_bottom is not None and top_bottom > 0:
43
+ s1_top, s1_bot = filter_sort_top_bottom(s1, top_bottom, False)
44
+ s2_top, s2_bot = filter_sort_top_bottom(s2, top_bottom, False)
45
+ top_overlap = len(s1_top.index.intersection(s2_top.index)) / top_bottom
46
+ bot_overlap = len(s1_bot.index.intersection(s2_bot.index)) / top_bottom
47
+ avg_overlap = (top_overlap + bot_overlap) / 2
48
+ return 1 - avg_overlap
49
+
50
+ s1, s2 = filter_sort_index(s1, s2)
51
+ assert s1.std() > 0, "s1 must have non-zero standard deviation"
52
+ assert s2.std() > 0, "s2 must have non-zero standard deviation"
53
+ return 1 - spearman_correlation(s1, s2)
54
+
55
+
56
+ def turnover(
57
+ s1: pd.Series,
58
+ s2: pd.Series,
59
+ ):
60
+ """Calculate the turnover between two series. Turnover is the total change in weights between
61
+ the two series divided by 2.
62
+
63
+ For 2 given series with overlapping indices, join the series on index, fill nans with zeroes
64
+ and calculate turnover as the absolute total difference between the two series divided by 2.
65
+ This is only relevant when the series are portfolio weights and not rank signals.
66
+
67
+ Arguments:
68
+ s1: pd.Series - the first series to compare
69
+ s2: pd.Series - the second series to compare
70
+ top_bottom: Optional[int] - the number of top and bottom predictions to use
71
+ when calculating the correlation. Results in
72
+ 2*top_bottom predictions.
73
+
74
+ Returns:
75
+ float - the turnover between the two series
76
+ """
77
+ s1, s2 = filter_sort_index(s1, s2)
78
+ turnover = (s1 - s2).abs().sum() / 2
79
+ return turnover
80
+
81
+
82
+ def neutral_weight(
83
+ submission: pd.Series,
84
+ signal_col: str,
85
+ neutralizer: pd.DataFrame,
86
+ weight: pd.Series,
87
+ ) -> pd.Series:
88
+ s_prime = tie_kept_rank__gaussianize__pow_1_5(submission.to_frame())[signal_col]
89
+ s_prime, neutralizer, weight = filter_sort_index_many( # type: ignore
90
+ [s_prime, neutralizer, weight]
91
+ )
92
+ neutral_weights = generate_neutralized_weights(s_prime, neutralizer, weight)
93
+ neutral_weights = weight_normalize(center(neutral_weights.to_frame()))[0]
94
+ return neutral_weights.sort_index()
95
+
96
+
97
+ def remap_ticker_col(
98
+ predictions: pd.DataFrame,
99
+ universe: pd.DataFrame,
100
+ ticker_col: str,
101
+ ) -> pd.DataFrame:
102
+ return (
103
+ predictions.join(universe, how="right")
104
+ .reset_index()
105
+ .set_index(ticker_col)
106
+ .sort_index()
107
+ )
108
+
109
+
110
+ def rank_and_fill_signal(
111
+ universe: pd.DataFrame,
112
+ submission: pd.Series,
113
+ signal_col: str,
114
+ ) -> pd.Series:
115
+ uni_joined_sub = universe.sort_index().join(
116
+ tie_kept_rank(submission.sort_index().to_frame())
117
+ )[[signal_col]]
118
+ filled_sub = uni_joined_sub.fillna(uni_joined_sub.median()).sort_index()
119
+ return filled_sub[signal_col]
120
+
121
+
122
+ def calculate_max_churn_and_turnover(
123
+ curr_sub: pd.DataFrame,
124
+ curr_neutralizer: pd.DataFrame,
125
+ curr_weight: pd.Series,
126
+ prev_week_subs: dict[str, pd.DataFrame],
127
+ prev_neutralizers: dict[str, pd.DataFrame],
128
+ prev_sample_weights: dict[str, pd.Series],
129
+ universe: pd.DataFrame,
130
+ curr_signal_col: str,
131
+ curr_ticker_col: str,
132
+ ) -> Tuple[float, float]:
133
+ """Calculate the maximum churn and turnover with respect to previous submissions.
134
+
135
+ Arguments:
136
+ curr_sub -- the current submission
137
+ curr_neutralizer -- the neutralizer DataFrame for the current submission
138
+ curr_weight -- the sample weights Series for the current submission
139
+ prev_week_subs -- a dictionary of datestamps to submissions
140
+ prev_neutralizers -- a dictionary of datestamps to neutralizers
141
+ prev_sample_weights -- a dictionary of datestamps to sample weights
142
+ universe -- the internal universe DataFrame
143
+ curr_signal_col -- the column name for signal in the current submission
144
+ curr_ticker_col -- the column name for tickers in the current submission
145
+
146
+ Returns:
147
+ prev_week_max_churn -- the maximum churn from previous submissions
148
+ prev_week_max_turnover -- the maximum turnover from previous submissions
149
+ """
150
+ curr_sub_vector: pd.Series = rank_and_fill_signal(
151
+ universe,
152
+ curr_sub.reset_index().set_index(curr_ticker_col).sort_index()[curr_signal_col],
153
+ curr_signal_col,
154
+ )
155
+ churn_stats = []
156
+ turnover_stats = []
157
+ neutralized_weights = neutral_weight(
158
+ curr_sub_vector, curr_signal_col, curr_neutralizer, curr_weight
159
+ )
160
+ for datestamp in prev_week_subs:
161
+ prev_sub = prev_week_subs[datestamp]
162
+ prev_neutralizer = prev_neutralizers[datestamp]
163
+ prev_weight = prev_sample_weights[datestamp]
164
+ prev_ticker_col, prev_signal_col = validate_headers_signals(prev_sub) # type: ignore
165
+ prev_universe = universe.reset_index().set_index(prev_ticker_col)
166
+ filtered_prev_sub_df, _ = validate_ids_signals(
167
+ prev_universe.index, prev_sub, prev_ticker_col
168
+ )
169
+ # in case the previous submission has a different ticker column,
170
+ # remap the ticker column of prev data to the current ticker column
171
+ filtered_prev_sub = remap_ticker_col(
172
+ filtered_prev_sub_df.set_index(prev_ticker_col),
173
+ universe=prev_universe,
174
+ ticker_col=curr_ticker_col,
175
+ )[curr_signal_col]
176
+ filtered_prev_sub = rank_and_fill_signal(
177
+ universe=universe,
178
+ submission=filtered_prev_sub,
179
+ signal_col=curr_signal_col,
180
+ )
181
+ prev_neutralizer = remap_ticker_col(
182
+ prev_neutralizer,
183
+ universe=prev_universe,
184
+ ticker_col=curr_ticker_col,
185
+ ).filter(like="neutralizer_")
186
+ prev_weight = remap_ticker_col(
187
+ prev_weight.to_frame(),
188
+ universe=prev_universe,
189
+ ticker_col=curr_ticker_col,
190
+ )[prev_weight.name]
191
+ prev_neutralized_weights = neutral_weight(
192
+ filtered_prev_sub, prev_signal_col, prev_neutralizer, prev_weight
193
+ )
194
+ try:
195
+ churn_val = abs(churn(curr_sub_vector, filtered_prev_sub))
196
+ except AssertionError as e:
197
+ if "does not have enough overlapping ids" in str(e):
198
+ continue
199
+ try:
200
+ turnover_val = abs(turnover(neutralized_weights, prev_neutralized_weights))
201
+ except AssertionError as e:
202
+ if "does not have enough overlapping ids" in str(e):
203
+ continue
204
+
205
+ churn_stats.append(churn_val)
206
+ turnover_stats.append(turnover_val)
207
+ if len(churn_stats) == 0:
208
+ prev_week_max_churn = 1.0
209
+ else:
210
+ prev_week_max_churn = max(churn_stats)
211
+ if len(turnover_stats) == 0:
212
+ prev_week_max_turnover = 1.0
213
+ else:
214
+ prev_week_max_turnover = max(turnover_stats)
215
+ return prev_week_max_churn, prev_week_max_turnover
@@ -1,5 +1,6 @@
1
1
  from numerai_tools.scoring import tie_kept_rank
2
2
 
3
+ import logging
3
4
  from typing import Tuple, List
4
5
 
5
6
  import pandas as pd
@@ -16,12 +17,15 @@ SIGNALS_ALLOWED_ID_COLS = [
16
17
  "numerai_ticker",
17
18
  ]
18
19
  SIGNALS_ALLOWED_PRED_COLS = ["prediction", "signal"]
20
+ SIGNALS_ALLOWED_DATE_COLS = ["friday_date", "date"]
19
21
  SIGNALS_MIN_TICKERS = 100
20
22
 
21
23
  CRYPTO_ALLOWED_ID_COLS = ["symbol"]
22
24
  CRYPTO_ALLOWED_PRED_COLS = ["prediction", "signal"]
23
25
  CRYPTO_MIN_TICKERS = 100
24
26
 
27
+ logger = logging.getLogger(__name__)
28
+
25
29
 
26
30
  def _validate_headers(
27
31
  expected_id_cols: List[str], expected_pred_cols: List[str], submission: pd.DataFrame
@@ -58,6 +62,17 @@ def validate_headers_numerai(submission: pd.DataFrame) -> Tuple[str, str]:
58
62
 
59
63
 
60
64
  def validate_headers_signals(submission: pd.DataFrame) -> Tuple[str, str]:
65
+ if "data_type" in submission.columns:
66
+ logger.warning(
67
+ "data_type column found in Signals submission. This is deprecated and will be removed in the future. "
68
+ "Please remove the data_type column from your Signals submission."
69
+ )
70
+ date_col = [
71
+ date_col
72
+ for date_col in SIGNALS_ALLOWED_DATE_COLS
73
+ if date_col in list(submission.columns)
74
+ ]
75
+ submission = submission.drop(columns=["data_type", *date_col], errors="ignore")
61
76
  return _validate_headers(
62
77
  SIGNALS_ALLOWED_ID_COLS, SIGNALS_ALLOWED_PRED_COLS, submission
63
78
  )
@@ -155,6 +170,7 @@ def clean_predictions(
155
170
  predictions: pd.DataFrame,
156
171
  id_col: str,
157
172
  rank_and_fill: bool,
173
+ left_join_on_ids: bool = False,
158
174
  ) -> pd.Series:
159
175
  """Prepare predictions for submission to Numerai.
160
176
  Filters out ids not in live data, drops duplicates, sets ids as index,
@@ -169,6 +185,7 @@ def clean_predictions(
169
185
  predictions: pd.DataFrame - the predictions to clean
170
186
  id_col: str - the column name of the ids
171
187
  rank_and_fill: bool - whether to rank and fill NaNs with 0.5
188
+ left_join_ids: bool - whether to left join the predictions onto the ids
172
189
  """
173
190
  assert len(live_ids) > 0, "live_ids must not be empty"
174
191
  assert live_ids.isna().sum() == 0, "live_ids must not contain NaNs"
@@ -177,13 +194,15 @@ def clean_predictions(
177
194
  # drop null indices
178
195
  predictions = predictions[~predictions[id_col].isna()]
179
196
  predictions = (
180
- predictions
181
- # filter out ids not in live data
182
- [predictions[id_col].isin(live_ids)]
197
+ predictions[
198
+ # filter out ids not in live data
199
+ predictions[id_col].isin(live_ids)
200
+ ]
183
201
  # drop duplicate ids (keep first)
184
202
  .drop_duplicates(subset=id_col, keep="first")
185
203
  # set ids as index
186
- .set_index(id_col).sort_index()
204
+ .set_index(id_col)
205
+ .sort_index()
187
206
  )
188
207
  # rank and fill with 0.5
189
208
  if rank_and_fill:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: numerai-tools
3
- Version: 0.4.2.dev1
3
+ Version: 0.5.0.dev0
4
4
  Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
5
5
  Home-page: https://github.com/numerai/numerai-tools
6
6
  Maintainer: Numerai
@@ -1,7 +1,7 @@
1
1
  from setuptools import setup
2
2
  from setuptools import find_packages
3
3
 
4
- VERSION = "0.4.2.dev1"
4
+ VERSION = "0.5.0.dev0"
5
5
 
6
6
 
7
7
  def load(path):
@@ -0,0 +1,139 @@
1
+ import unittest
2
+
3
+ import numpy as np
4
+ import pandas as pd # type: ignore
5
+
6
+ from numerai_tools.signals import (
7
+ churn,
8
+ turnover,
9
+ calculate_max_churn_and_turnover,
10
+ )
11
+ from .util import (
12
+ generate_fake_universe,
13
+ generate_new_submission,
14
+ )
15
+
16
+
17
+ class TestSignals(unittest.TestCase):
18
+ def setUp(self):
19
+ self.up = pd.Series(list(range(5))).rename("up")
20
+ self.down = pd.Series(list(reversed(range(5)))).rename("down")
21
+ self.up_down = pd.Series([0, 1, 2, 1, 0]).rename("up_down")
22
+ self.oscillate = pd.Series([1, 0, 1, 0, 1]).rename("oscillate")
23
+ self.constant = pd.Series([1, 1, 1, 1, 1]).rename("pos_neg")
24
+
25
+ def test_churn(self):
26
+ assert np.isclose(churn(self.up, self.up), 0)
27
+ assert np.isclose(churn(self.up, self.up_down), 1)
28
+ assert np.isclose(churn(self.up, self.oscillate), 1)
29
+ assert np.isclose(churn(self.up, self.down), 2)
30
+ self.assertRaisesRegex(
31
+ AssertionError,
32
+ "s2 must have non-zero standard deviation",
33
+ churn,
34
+ self.up,
35
+ self.constant,
36
+ )
37
+
38
+ def test_churn_tb(self):
39
+ tmp = churn(self.up, self.up, top_bottom=2)
40
+ assert np.isclose(tmp, 0), tmp
41
+ tmp = churn(self.up, self.up_down, top_bottom=2)
42
+ assert np.isclose(tmp, 0.5), tmp
43
+ tmp = churn(self.up, self.oscillate, top_bottom=2)
44
+ assert np.isclose(tmp, 0.5), tmp
45
+ tmp = churn(self.up, self.down, top_bottom=2)
46
+ assert np.isclose(tmp, 1), tmp
47
+ tmp = churn(self.up, self.constant, top_bottom=2)
48
+ assert np.isclose(tmp, 0), tmp
49
+
50
+ def test_turnover(self):
51
+ assert np.isclose(turnover(self.up, self.up), 0)
52
+ assert np.isclose(turnover(self.up, self.up_down), 3)
53
+ assert np.isclose(turnover(self.up, self.oscillate), 4.5)
54
+ assert np.isclose(turnover(self.up, self.down), 6)
55
+ assert np.isclose(turnover(self.up, self.constant), 3.5)
56
+
57
+ def test_churn_first_submission(self):
58
+ """
59
+ Test that the churn function works for the first submission
60
+ No exceptions should be raised, should return 1
61
+ """
62
+ fake_universe = generate_fake_universe("20130308")
63
+ fake_submission = generate_new_submission(fake_universe)
64
+ fake_neutralizers = pd.DataFrame(
65
+ {
66
+ "neutralizer_1": [0.1] * len(fake_universe),
67
+ "neutralizer_2": [0.2] * len(fake_universe),
68
+ },
69
+ index=fake_universe["numerai_ticker"],
70
+ )
71
+ fake_sample_weights = pd.Series(
72
+ [0.5] * len(fake_universe),
73
+ index=fake_universe["numerai_ticker"],
74
+ name="sample_weight",
75
+ )
76
+ churn, turnover = calculate_max_churn_and_turnover(
77
+ curr_sub=fake_submission,
78
+ curr_neutralizer=fake_neutralizers,
79
+ curr_weight=fake_sample_weights,
80
+ prev_week_subs=[],
81
+ prev_neutralizers={"20240208": fake_neutralizers},
82
+ prev_sample_weights={"20240208": fake_sample_weights},
83
+ universe=fake_universe.set_index("numerai_ticker").sort_index(),
84
+ curr_signal_col="signal",
85
+ curr_ticker_col="numerai_ticker",
86
+ )
87
+ assert np.isclose(churn, 1)
88
+ assert np.isclose(turnover, 1)
89
+
90
+ def test_churn_handles_different_id_columns(self):
91
+ """
92
+ Test that the churn function works when
93
+ previous submission has different id columns.
94
+ """
95
+ fake_universe = generate_fake_universe("20130308")
96
+ fake_submission = generate_new_submission(fake_universe, legacy_headers=True)
97
+ new_fake_universe = generate_fake_universe(
98
+ date_value="20130308", ticker_col="ticker"
99
+ )
100
+ fake_universe["ticker"] = new_fake_universe["ticker"]
101
+ prev_submission = fake_submission.copy()
102
+ fake_neutralizers = pd.DataFrame(
103
+ {
104
+ "neutralizer_1": [0.1] * len(fake_universe),
105
+ "neutralizer_2": [0.2] * len(fake_universe),
106
+ },
107
+ index=fake_universe["numerai_ticker"],
108
+ )
109
+ fake_sample_weights = pd.Series(
110
+ [0.5] * len(fake_universe),
111
+ index=fake_universe["numerai_ticker"],
112
+ name="sample_weight",
113
+ )
114
+ # switch out the numerai_ticke col in-place
115
+ prev_submission["numerai_ticker"] = new_fake_universe["ticker"]
116
+ prev_submission.rename(columns={"numerai_ticker": "ticker"}, inplace=True)
117
+ prev_neutralizers = fake_neutralizers.copy()
118
+ prev_neutralizers.index = new_fake_universe["ticker"]
119
+ prev_neutralizers.index.name = "ticker"
120
+ prev_sample_weights = fake_sample_weights.copy()
121
+ prev_sample_weights.index = new_fake_universe["ticker"]
122
+ prev_sample_weights.index.name = "ticker"
123
+ churn, turnover = calculate_max_churn_and_turnover(
124
+ curr_sub=fake_submission,
125
+ curr_neutralizer=fake_neutralizers,
126
+ curr_weight=fake_sample_weights,
127
+ prev_week_subs={"20240208": prev_submission},
128
+ prev_neutralizers={"20240208": prev_neutralizers},
129
+ prev_sample_weights={"20240208": prev_sample_weights},
130
+ universe=fake_universe.set_index("numerai_ticker").sort_index(),
131
+ curr_signal_col="signal",
132
+ curr_ticker_col="numerai_ticker",
133
+ )
134
+ assert np.isclose(churn, 0)
135
+ assert np.isclose(turnover, 0)
136
+
137
+
138
+ if __name__ == "__main__":
139
+ unittest.main()
@@ -155,6 +155,18 @@ class TestSubmissions(unittest.TestCase):
155
155
  sub[[sub.columns[1]]],
156
156
  )
157
157
 
158
+ def test_validate_headers_signals_data_type_and_date_col(self):
159
+ fake_sub = generate_submission(self.ids, "ticker", "signal")
160
+ fake_sub["data_type"] = "signals"
161
+ fake_sub["friday_date"] = "2023-01-01"
162
+ with self.assertLogs(level="WARNING") as cm:
163
+ assert validate_headers_signals(fake_sub) == ("ticker", "signal")
164
+ self.assertIn(
165
+ "WARNING:numerai_tools.submissions:data_type column found in Signals submission. This is deprecated and will be removed in the future. "
166
+ "Please remove the data_type column from your Signals submission.",
167
+ cm.output[0],
168
+ )
169
+
158
170
  def test_validate_headers_crypto(self):
159
171
  for sub in self.crypto_subs:
160
172
  assert validate_headers_crypto(sub) == tuple(sub.columns)
@@ -1,72 +0,0 @@
1
- from numerai_tools.scoring import (
2
- filter_sort_index,
3
- filter_sort_top_bottom,
4
- spearman_correlation,
5
- )
6
-
7
- from typing import List, Tuple, Union, Optional
8
-
9
- import pandas as pd
10
-
11
-
12
- def churn(
13
- s1: pd.Series,
14
- s2: pd.Series,
15
- top_bottom: Optional[int] = None,
16
- ) -> float:
17
- """Calculate the churn between two series. Churn is the proportion of elements
18
- that are different between the two series.
19
-
20
- For 2 given series with overlapping indices, churn is 1 - Spearman Correlation.
21
- If top_bottom is provided, the churn is calculated as the average of the % of
22
- tickers that stay in the top and bottom predictions. This is only relevant when
23
- the series are rank signals and not portfolio weights.
24
-
25
- Arguments:
26
- s1: pd.Series - the first series to compare
27
- s2: pd.Series - the second series to compare
28
- top_bottom: Optional[int] - the number of top and bottom predictions to use
29
- when calculating the correlation. Results in
30
- 2*top_bottom predictions.
31
-
32
- Returns:
33
- float - the churn between the two series
34
- """
35
- if top_bottom is not None and top_bottom > 0:
36
- s1_top, s1_bot = filter_sort_top_bottom(s1, top_bottom, False)
37
- s2_top, s2_bot = filter_sort_top_bottom(s2, top_bottom, False)
38
- top_overlap = len(s1_top.index.intersection(s2_top.index)) / top_bottom
39
- bot_overlap = len(s1_bot.index.intersection(s2_bot.index)) / top_bottom
40
- avg_overlap = (top_overlap + bot_overlap) / 2
41
- return 1 - avg_overlap
42
-
43
- s1, s2 = filter_sort_index(s1, s2)
44
- assert s1.std() > 0, "s1 must have non-zero standard deviation"
45
- assert s2.std() > 0, "s2 must have non-zero standard deviation"
46
- return 1 - spearman_correlation(s1, s2)
47
-
48
-
49
- def turnover(
50
- s1: pd.Series,
51
- s2: pd.Series,
52
- ):
53
- """Calculate the turnover between two series. Turnover is the total change in weights between
54
- the two series divided by 2.
55
-
56
- For 2 given series with overlapping indices, join the series on index, fill nans with zeroes
57
- and calculate turnover as the absolute total difference between the two series divided by 2.
58
- This is only relevant when the series are portfolio weights and not rank signals.
59
-
60
- Arguments:
61
- s1: pd.Series - the first series to compare
62
- s2: pd.Series - the second series to compare
63
- top_bottom: Optional[int] - the number of top and bottom predictions to use
64
- when calculating the correlation. Results in
65
- 2*top_bottom predictions.
66
-
67
- Returns:
68
- float - the turnover between the two series
69
- """
70
- s1, s2 = filter_sort_index(s1, s2)
71
- turnover = (s1 - s2).abs().sum() / 2
72
- return turnover
@@ -1,51 +0,0 @@
1
- import unittest
2
-
3
- import numpy as np
4
- import pandas as pd # type: ignore
5
-
6
- from numerai_tools.signals import churn, turnover
7
-
8
-
9
- class TestSignals(unittest.TestCase):
10
- def setUp(self):
11
- self.up = pd.Series(list(range(5))).rename("up")
12
- self.down = pd.Series(list(reversed(range(5)))).rename("down")
13
- self.up_down = pd.Series([0, 1, 2, 1, 0]).rename("up_down")
14
- self.oscillate = pd.Series([1, 0, 1, 0, 1]).rename("oscillate")
15
- self.constant = pd.Series([1, 1, 1, 1, 1]).rename("pos_neg")
16
-
17
- def test_churn(self):
18
- assert np.isclose(churn(self.up, self.up), 0)
19
- assert np.isclose(churn(self.up, self.up_down), 1)
20
- assert np.isclose(churn(self.up, self.oscillate), 1)
21
- assert np.isclose(churn(self.up, self.down), 2)
22
- self.assertRaisesRegex(
23
- AssertionError,
24
- "s2 must have non-zero standard deviation",
25
- churn,
26
- self.up,
27
- self.constant,
28
- )
29
-
30
- def test_churn_tb(self):
31
- tmp = churn(self.up, self.up, top_bottom=2)
32
- assert np.isclose(tmp, 0), tmp
33
- tmp = churn(self.up, self.up_down, top_bottom=2)
34
- assert np.isclose(tmp, 0.5), tmp
35
- tmp = churn(self.up, self.oscillate, top_bottom=2)
36
- assert np.isclose(tmp, 0.5), tmp
37
- tmp = churn(self.up, self.down, top_bottom=2)
38
- assert np.isclose(tmp, 1), tmp
39
- tmp = churn(self.up, self.constant, top_bottom=2)
40
- assert np.isclose(tmp, 0), tmp
41
-
42
- def test_turnover(self):
43
- assert np.isclose(turnover(self.up, self.up), 0)
44
- assert np.isclose(turnover(self.up, self.up_down), 3)
45
- assert np.isclose(turnover(self.up, self.oscillate), 4.5)
46
- assert np.isclose(turnover(self.up, self.down), 6)
47
- assert np.isclose(turnover(self.up, self.constant), 3.5)
48
-
49
-
50
- if __name__ == "__main__":
51
- unittest.main()