numerai-tools 0.4.3__tar.gz → 0.5.0.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (20) hide show
  1. {numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/PKG-INFO +1 -1
  2. {numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/numerai_tools/scoring.py +53 -43
  3. numerai_tools-0.5.0.dev1/numerai_tools/signals.py +217 -0
  4. {numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/numerai_tools/submissions.py +23 -5
  5. {numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/numerai_tools.egg-info/PKG-INFO +1 -1
  6. {numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/setup.py +1 -1
  7. {numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/tests/test_scoring.py +2 -2
  8. numerai_tools-0.5.0.dev1/tests/test_signals.py +139 -0
  9. {numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/tests/test_submissions.py +13 -1
  10. numerai_tools-0.4.3/numerai_tools/signals.py +0 -72
  11. numerai_tools-0.4.3/tests/test_signals.py +0 -51
  12. {numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/LICENSE +0 -0
  13. {numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/README.md +0 -0
  14. {numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/numerai_tools/__init__.py +0 -0
  15. {numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/numerai_tools/py.typed +0 -0
  16. {numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/numerai_tools.egg-info/SOURCES.txt +0 -0
  17. {numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/numerai_tools.egg-info/dependency_links.txt +0 -0
  18. {numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/numerai_tools.egg-info/requires.txt +0 -0
  19. {numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/numerai_tools.egg-info/top_level.txt +0 -0
  20. {numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: numerai_tools
3
- Version: 0.4.3
3
+ Version: 0.5.0.dev1
4
4
  Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
5
5
  Home-page: https://github.com/numerai/numerai-tools
6
6
  Maintainer: Numerai
@@ -1,8 +1,8 @@
1
- from typing import List, Tuple, Union, Optional, TypeVar
1
+ from typing import List, Tuple, Union, Optional, TypeVar, cast, Any
2
2
 
3
3
  import numpy as np
4
- import pandas as pd # type: ignore
5
- from scipy import stats # type: ignore
4
+ import pandas as pd
5
+ from scipy import stats
6
6
  from sklearn.preprocessing import OneHotEncoder # type: ignore
7
7
 
8
8
 
@@ -43,12 +43,13 @@ def filter_sort_index(
43
43
  "s2 does not have enough overlapping ids with s1,"
44
44
  f" must have >= {round(1-max_filtered_ratio,2)*100}% overlapping ids"
45
45
  )
46
- return s1.loc[ids].sort_index(), s2.loc[ids].sort_index()
46
+ return cast(S1, s1.loc[ids].sort_index()), cast(S2, s2.loc[ids].sort_index())
47
47
 
48
48
 
49
49
  def filter_sort_index_many(
50
- inputs: List[S1], max_filtered_ratio: float = DEFAULT_MAX_FILTERED_INDEX_RATIO
51
- ) -> List[S1]:
50
+ inputs: List[Any],
51
+ max_filtered_ratio: float = DEFAULT_MAX_FILTERED_INDEX_RATIO,
52
+ ) -> List[Any]:
52
53
  """Filters the indices of the given list of series to match each other,
53
54
  then sorts the indices, then checks that we didn't filter too many indices
54
55
  before returning the filtered and sorted series.
@@ -74,25 +75,38 @@ def filter_sort_index_many(
74
75
 
75
76
 
76
77
  def filter_sort_top_bottom(
77
- s: pd.Series, top_bottom: int, return_concatenated: bool = True
78
- ) -> Union[pd.Series, Tuple[pd.Series, pd.Series]]:
78
+ s: pd.Series, top_bottom: int
79
+ ) -> Tuple[pd.Series, pd.Series]:
79
80
  """Filters the series according to the top n and bottom n values
80
- then sorts the index and returns the filtered and sorted series.
81
+ then sorts the index and returns two filtered and sorted series
82
+ for the top and bottom values respectively.
81
83
 
82
84
  Arguments:
83
85
  s: pd.Series - the data to filter and sort
84
86
  top_bottom: int - the number of top n and bottom n values to keep
85
87
 
86
88
  Returns:
87
- pd.Series - the filtered and sorted data
89
+ Tuple[pd.Series, pd.Series] - the filtered and sorted top and bottom series respectively
88
90
  """
89
91
  tb_idx = np.argsort(s, kind="stable")
90
92
  bot = s.iloc[tb_idx[:top_bottom]]
91
93
  top = s.iloc[tb_idx[-top_bottom:]]
92
- if return_concatenated:
93
- return pd.concat([top, bot]).sort_index()
94
- else:
95
- return top.sort_index(), bot.sort_index()
94
+ return top.sort_index(), bot.sort_index()
95
+
96
+
97
+ def filter_sort_top_bottom_concat(s: pd.Series, top_bottom: int) -> pd.Series:
98
+ """Similar to filter_sort_top_bottom, but concatenates the top and bottom series
99
+ into 1 series and then sorts the index.
100
+
101
+ Arguments:
102
+ s: pd.Series - the data to filter and sort
103
+ top_bottom: int - the number of top n and bottom n values to keep
104
+
105
+ Returns:
106
+ pd.Series - the concatenated and sorted series of top and bottom values
107
+ """
108
+ top, bot = filter_sort_top_bottom(s, top_bottom)
109
+ return pd.concat([top, bot]).sort_index()
96
110
 
97
111
 
98
112
  def rank(df: pd.DataFrame, method: str = "average") -> pd.DataFrame:
@@ -133,14 +147,14 @@ def variance_normalize(df: pd.DataFrame) -> pd.DataFrame:
133
147
  return df / np.std(df, axis=0)
134
148
 
135
149
 
136
- def weight_normalize(df: pd.DataFrame) -> pd.DataFrame:
137
- """Scale a df such that all columns have absolute value sum == 1."""
138
- return df / df.abs().sum(axis=0)
150
+ def weight_normalize(s: S1) -> S1:
151
+ """Scale an input such that all columns have absolute value sum == 1."""
152
+ return cast(S1, s / s.abs().sum(axis=0))
139
153
 
140
154
 
141
- def center(df: pd.DataFrame) -> pd.DataFrame:
142
- """Shift the df such that all columns have mean == 0."""
143
- return df - df.mean()
155
+ def center(s: S1) -> S1:
156
+ """Shift the input such that all columns have mean == 0."""
157
+ return cast(S1, s - s.mean())
144
158
 
145
159
 
146
160
  def standardize(df: pd.DataFrame) -> pd.DataFrame:
@@ -179,7 +193,7 @@ def pearson_correlation(
179
193
  target: pd.Series, predictions: pd.Series, top_bottom: Optional[int] = None
180
194
  ) -> float:
181
195
  if top_bottom is not None and top_bottom > 0:
182
- predictions = filter_sort_top_bottom(predictions, top_bottom)
196
+ predictions = filter_sort_top_bottom_concat(predictions, top_bottom)
183
197
  target, predictions = filter_sort_index(
184
198
  target, predictions, (1 - top_bottom / len(target))
185
199
  )
@@ -205,7 +219,7 @@ def power(df: pd.DataFrame, p: float) -> pd.DataFrame:
205
219
  """
206
220
  assert not df.isna().any().any(), "Data contains NaNs"
207
221
  assert np.array_equal(df.index.sort_values(), df.index), "Index is not sorted"
208
- result = np.sign(df) * np.abs(df) ** p
222
+ result = cast(pd.DataFrame, np.sign(df) * np.abs(df) ** p)
209
223
  assert ((result.std() == 0) | (result.corrwith(df) >= 0.9)).all()
210
224
  return result
211
225
 
@@ -221,7 +235,7 @@ def gaussian(df: pd.DataFrame) -> pd.DataFrame:
221
235
  pd.DataFrame - the gaussianized data
222
236
  """
223
237
  assert np.array_equal(df.index.sort_values(), df.index)
224
- return df.apply(lambda series: stats.norm.ppf(series))
238
+ return df.apply(lambda series: cast(np.ndarray, stats.norm.ppf(series)))
225
239
 
226
240
 
227
241
  def orthogonalize(v: np.ndarray, u: np.ndarray) -> np.ndarray:
@@ -303,7 +317,7 @@ def correlation_contribution(
303
317
  m = gaussian(tie_kept_rank(meta_model.to_frame()))[meta_model.name].values
304
318
 
305
319
  # orthogonalize predictions wrt meta model
306
- neutral_preds = orthogonalize(p, m)
320
+ neutral_preds = orthogonalize(p, cast(np.ndarray, m))
307
321
 
308
322
  # convert target to buckets [-2, -1, 0, 1, 2]
309
323
  if (live_targets >= 0).all() and (live_targets <= 1).all():
@@ -314,9 +328,9 @@ def correlation_contribution(
314
328
  # filter each column to its top and bottom n predictions
315
329
  neutral_preds_df = pd.DataFrame(
316
330
  neutral_preds, columns=predictions.columns, index=predictions.index
317
- ).apply(lambda p: filter_sort_top_bottom(p, top_bottom))
318
- # create a dataframe for targets to match the filtered predictions
319
- live_targets = (
331
+ ).apply(lambda p: filter_sort_top_bottom_concat(p, top_bottom))
332
+ mmc_matrix = (
333
+ # create a dataframe for targets to match the filtered predictions
320
334
  neutral_preds_df.apply(
321
335
  lambda p: filter_sort_index(
322
336
  p,
@@ -326,19 +340,15 @@ def correlation_contribution(
326
340
  )
327
341
  .fillna(0)
328
342
  .T.values
329
- )
330
- # fillna with 0 so we don't get NaNs in the dot product
331
- neutral_preds = neutral_preds_df.fillna(0).values
332
-
333
- # multiply target and neutralized predictions
334
- # this is equivalent to covariance b/c mean = 0
335
- mmc = live_targets @ neutral_preds
336
- if top_bottom is not None and top_bottom > 0:
343
+ # then fill NaNs with 0 so we don't get NaNs in the dot product
344
+ # and multiply target w/ neutral preds to get MMC
345
+ ) @ neutral_preds_df.fillna(0).values
337
346
  # only the diagonal is the proper score
338
- mmc = np.diag(mmc) / (top_bottom * 2)
347
+ mmc = np.diag(mmc_matrix) / (top_bottom * 2)
339
348
  else:
340
- mmc /= len(live_targets)
341
-
349
+ # multiply target and neutralized predictions
350
+ # this is equivalent to covariance b/c mean = 0
351
+ mmc = (live_targets @ neutral_preds) / len(live_targets)
342
352
  return pd.Series(mmc, index=predictions.columns)
343
353
 
344
354
 
@@ -522,10 +532,10 @@ def max_feature_correlation(
522
532
  feature_correlations = features.apply(
523
533
  lambda f: pearson_correlation(f, s, top_bottom)
524
534
  )
525
- feature_correlations = np.abs(feature_correlations)
535
+ feature_correlations = feature_correlations.abs()
526
536
  max_feature = feature_correlations.idxmax()
527
537
  max_corr = feature_correlations[max_feature]
528
- return max_feature, max_corr
538
+ return str(max_feature), max_corr
529
539
 
530
540
 
531
541
  def generate_neutralized_weights(
@@ -608,9 +618,9 @@ def meta_portfolio_contribution(
608
618
  s_prime, neutralizers, sample_weights
609
619
  )
610
620
  )
611
- w = weights[stakes.index].values
612
- s = stake_weights.values
613
- t = targets.values
621
+ w = cast(np.ndarray, weights[stakes.index].values)
622
+ s = cast(np.ndarray, stake_weights.values)
623
+ t = cast(np.ndarray, targets.values)
614
624
  swp = w @ s
615
625
  swp = swp - swp.mean()
616
626
  l1_norm = np.sum(np.abs(swp))
@@ -0,0 +1,217 @@
1
+ from typing import Tuple, Optional
2
+
3
+ from numerai_tools.submissions import validate_headers_signals, validate_ids_signals
4
+ from numerai_tools.scoring import (
5
+ filter_sort_index,
6
+ filter_sort_top_bottom,
7
+ spearman_correlation,
8
+ tie_kept_rank,
9
+ tie_kept_rank__gaussianize__pow_1_5,
10
+ filter_sort_index_many,
11
+ generate_neutralized_weights,
12
+ weight_normalize,
13
+ center,
14
+ )
15
+
16
+ import pandas as pd
17
+
18
+
19
+ def churn(
20
+ s1: pd.Series,
21
+ s2: pd.Series,
22
+ top_bottom: Optional[int] = None,
23
+ ) -> float:
24
+ """Calculate the churn between two series. Churn is the proportion of elements
25
+ that are different between the two series.
26
+
27
+ For 2 given series with overlapping indices, churn is 1 - Spearman Correlation.
28
+ If top_bottom is provided, the churn is calculated as the average of the % of
29
+ tickers that stay in the top and bottom predictions. This is only relevant when
30
+ the series are rank signals and not portfolio weights.
31
+
32
+ Arguments:
33
+ s1: pd.Series - the first series to compare
34
+ s2: pd.Series - the second series to compare
35
+ top_bottom: Optional[int] - the number of top and bottom predictions to use
36
+ when calculating the correlation. Results in
37
+ 2*top_bottom predictions.
38
+
39
+ Returns:
40
+ float - the churn between the two series
41
+ """
42
+ if top_bottom is not None and top_bottom > 0:
43
+ s1_top, s1_bot = filter_sort_top_bottom(s1, top_bottom)
44
+ s2_top, s2_bot = filter_sort_top_bottom(s2, top_bottom)
45
+ top_overlap = len(s1_top.index.intersection(s2_top.index)) / top_bottom
46
+ bot_overlap = len(s1_bot.index.intersection(s2_bot.index)) / top_bottom
47
+ avg_overlap = (top_overlap + bot_overlap) / 2
48
+ return 1 - avg_overlap
49
+
50
+ s1, s2 = filter_sort_index(s1, s2)
51
+ assert s1.std() > 0, "s1 must have non-zero standard deviation"
52
+ assert s2.std() > 0, "s2 must have non-zero standard deviation"
53
+ return 1 - spearman_correlation(s1, s2)
54
+
55
+
56
+ def turnover(
57
+ s1: pd.Series,
58
+ s2: pd.Series,
59
+ ):
60
+ """Calculate the turnover between two series. Turnover is the total change in weights between
61
+ the two series divided by 2.
62
+
63
+ For 2 given series with overlapping indices, join the series on index, fill nans with zeroes
64
+ and calculate turnover as the absolute total difference between the two series divided by 2.
65
+ This is only relevant when the series are portfolio weights and not rank signals.
66
+
67
+ Arguments:
68
+ s1: pd.Series - the first series to compare
69
+ s2: pd.Series - the second series to compare
70
+ top_bottom: Optional[int] - the number of top and bottom predictions to use
71
+ when calculating the correlation. Results in
72
+ 2*top_bottom predictions.
73
+
74
+ Returns:
75
+ float - the turnover between the two series
76
+ """
77
+ s1, s2 = filter_sort_index(s1, s2)
78
+ turnover = (s1 - s2).abs().sum() / 2
79
+ return turnover
80
+
81
+
82
+ def neutral_weight(
83
+ submission: pd.Series,
84
+ signal_col: str,
85
+ neutralizer: pd.DataFrame,
86
+ weight: pd.Series,
87
+ ) -> pd.Series:
88
+ s_prime = tie_kept_rank__gaussianize__pow_1_5(submission.to_frame())
89
+ s_prime, neutralizer, weight = filter_sort_index_many(
90
+ [s_prime, neutralizer, weight]
91
+ )
92
+ neutral_weights = generate_neutralized_weights(
93
+ s_prime[signal_col], neutralizer, weight
94
+ )
95
+ neutral_weights = weight_normalize(center(neutral_weights.to_frame()))[0]
96
+ return neutral_weights.sort_index()
97
+
98
+
99
+ def remap_ticker_col(
100
+ predictions: pd.DataFrame,
101
+ universe: pd.DataFrame,
102
+ ticker_col: str,
103
+ ) -> pd.DataFrame:
104
+ return (
105
+ predictions.join(universe, how="right")
106
+ .reset_index()
107
+ .set_index(ticker_col)
108
+ .sort_index()
109
+ )
110
+
111
+
112
+ def rank_and_fill_signal(
113
+ universe: pd.DataFrame,
114
+ submission: pd.Series,
115
+ signal_col: str,
116
+ ) -> pd.Series:
117
+ uni_joined_sub = universe.sort_index().join(
118
+ tie_kept_rank(submission.sort_index().to_frame())
119
+ )[[signal_col]]
120
+ filled_sub = uni_joined_sub.fillna(uni_joined_sub.median()).sort_index()
121
+ return filled_sub[signal_col]
122
+
123
+
124
+ def calculate_max_churn_and_turnover(
125
+ curr_sub: pd.DataFrame,
126
+ curr_neutralizer: pd.DataFrame,
127
+ curr_weight: pd.Series,
128
+ prev_week_subs: dict[str, pd.DataFrame],
129
+ prev_neutralizers: dict[str, pd.DataFrame],
130
+ prev_sample_weights: dict[str, pd.Series],
131
+ universe: pd.DataFrame,
132
+ curr_signal_col: str,
133
+ curr_ticker_col: str,
134
+ ) -> Tuple[float, float]:
135
+ """Calculate the maximum churn and turnover with respect to previous submissions.
136
+
137
+ Arguments:
138
+ curr_sub -- the current submission
139
+ curr_neutralizer -- the neutralizer DataFrame for the current submission
140
+ curr_weight -- the sample weights Series for the current submission
141
+ prev_week_subs -- a dictionary of datestamps to submissions
142
+ prev_neutralizers -- a dictionary of datestamps to neutralizers
143
+ prev_sample_weights -- a dictionary of datestamps to sample weights
144
+ universe -- the internal universe DataFrame
145
+ curr_signal_col -- the column name for signal in the current submission
146
+ curr_ticker_col -- the column name for tickers in the current submission
147
+
148
+ Returns:
149
+ prev_week_max_churn -- the maximum churn from previous submissions
150
+ prev_week_max_turnover -- the maximum turnover from previous submissions
151
+ """
152
+ curr_sub_vector: pd.Series = rank_and_fill_signal(
153
+ universe,
154
+ curr_sub.reset_index().set_index(curr_ticker_col).sort_index()[curr_signal_col],
155
+ curr_signal_col,
156
+ )
157
+ churn_stats = []
158
+ turnover_stats = []
159
+ neutralized_weights = neutral_weight(
160
+ curr_sub_vector, curr_signal_col, curr_neutralizer, curr_weight
161
+ )
162
+ for datestamp in prev_week_subs:
163
+ prev_sub = prev_week_subs[datestamp]
164
+ prev_neutralizer = prev_neutralizers[datestamp]
165
+ prev_weight = prev_sample_weights[datestamp]
166
+ prev_ticker_col, prev_signal_col = validate_headers_signals(prev_sub)
167
+ prev_universe = universe.reset_index().set_index(prev_ticker_col)
168
+ filtered_prev_sub_df, _ = validate_ids_signals(
169
+ prev_universe.index.to_series(), prev_sub, prev_ticker_col
170
+ )
171
+ # in case the previous submission has a different ticker column,
172
+ # remap the ticker column of prev data to the current ticker column
173
+ filtered_prev_sub = remap_ticker_col(
174
+ filtered_prev_sub_df.set_index(prev_ticker_col),
175
+ universe=prev_universe,
176
+ ticker_col=curr_ticker_col,
177
+ )[curr_signal_col]
178
+ filtered_prev_sub = rank_and_fill_signal(
179
+ universe=universe,
180
+ submission=filtered_prev_sub,
181
+ signal_col=curr_signal_col,
182
+ )
183
+ prev_neutralizer = remap_ticker_col(
184
+ prev_neutralizer,
185
+ universe=prev_universe,
186
+ ticker_col=curr_ticker_col,
187
+ ).filter(like="neutralizer_")
188
+ prev_weight = remap_ticker_col(
189
+ prev_weight.to_frame(),
190
+ universe=prev_universe,
191
+ ticker_col=curr_ticker_col,
192
+ )[prev_weight.name]
193
+ prev_neutralized_weights = neutral_weight(
194
+ filtered_prev_sub, prev_signal_col, prev_neutralizer, prev_weight
195
+ )
196
+ try:
197
+ churn_val = abs(churn(curr_sub_vector, filtered_prev_sub))
198
+ except AssertionError as e:
199
+ if "does not have enough overlapping ids" in str(e):
200
+ continue
201
+ try:
202
+ turnover_val = abs(turnover(neutralized_weights, prev_neutralized_weights))
203
+ except AssertionError as e:
204
+ if "does not have enough overlapping ids" in str(e):
205
+ continue
206
+
207
+ churn_stats.append(churn_val)
208
+ turnover_stats.append(turnover_val)
209
+ if len(churn_stats) == 0:
210
+ prev_week_max_churn = 1.0
211
+ else:
212
+ prev_week_max_churn = max(churn_stats)
213
+ if len(turnover_stats) == 0:
214
+ prev_week_max_turnover = 1.0
215
+ else:
216
+ prev_week_max_turnover = max(turnover_stats)
217
+ return prev_week_max_churn, prev_week_max_turnover
@@ -1,5 +1,6 @@
1
1
  from numerai_tools.scoring import tie_kept_rank
2
2
 
3
+ import logging
3
4
  from typing import Tuple, List
4
5
 
5
6
  import pandas as pd
@@ -16,12 +17,15 @@ SIGNALS_ALLOWED_ID_COLS = [
16
17
  "numerai_ticker",
17
18
  ]
18
19
  SIGNALS_ALLOWED_PRED_COLS = ["prediction", "signal"]
20
+ SIGNALS_ALLOWED_DATE_COLS = ["friday_date", "date"]
19
21
  SIGNALS_MIN_TICKERS = 100
20
22
 
21
23
  CRYPTO_ALLOWED_ID_COLS = ["symbol"]
22
24
  CRYPTO_ALLOWED_PRED_COLS = ["prediction", "signal"]
23
25
  CRYPTO_MIN_TICKERS = 100
24
26
 
27
+ logger = logging.getLogger(__name__)
28
+
25
29
 
26
30
  def _validate_headers(
27
31
  expected_id_cols: List[str], expected_pred_cols: List[str], submission: pd.DataFrame
@@ -58,6 +62,17 @@ def validate_headers_numerai(submission: pd.DataFrame) -> Tuple[str, str]:
58
62
 
59
63
 
60
64
  def validate_headers_signals(submission: pd.DataFrame) -> Tuple[str, str]:
65
+ if "data_type" in submission.columns:
66
+ logger.warning(
67
+ "data_type column found in Signals submission. This is deprecated and will be removed in the future. "
68
+ "Please remove the data_type column from your Signals submission."
69
+ )
70
+ date_col = [
71
+ date_col
72
+ for date_col in SIGNALS_ALLOWED_DATE_COLS
73
+ if date_col in list(submission.columns)
74
+ ]
75
+ submission = submission.drop(columns=["data_type", *date_col], errors="ignore")
61
76
  return _validate_headers(
62
77
  SIGNALS_ALLOWED_ID_COLS, SIGNALS_ALLOWED_PRED_COLS, submission
63
78
  )
@@ -155,7 +170,7 @@ def clean_predictions(
155
170
  predictions: pd.DataFrame,
156
171
  id_col: str,
157
172
  rank_and_fill: bool,
158
- ) -> pd.Series:
173
+ ) -> pd.DataFrame:
159
174
  """Prepare predictions for submission to Numerai.
160
175
  Filters out ids not in live data, drops duplicates, sets ids as index,
161
176
  then optionally ranks (keeping ties) and fills NaNs with 0.5.
@@ -169,6 +184,7 @@ def clean_predictions(
169
184
  predictions: pd.DataFrame - the predictions to clean
170
185
  id_col: str - the column name of the ids
171
186
  rank_and_fill: bool - whether to rank and fill NaNs with 0.5
187
+ left_join_ids: bool - whether to left join the predictions onto the ids
172
188
  """
173
189
  assert len(live_ids) > 0, "live_ids must not be empty"
174
190
  assert live_ids.isna().sum() == 0, "live_ids must not contain NaNs"
@@ -177,13 +193,15 @@ def clean_predictions(
177
193
  # drop null indices
178
194
  predictions = predictions[~predictions[id_col].isna()]
179
195
  predictions = (
180
- predictions
181
- # filter out ids not in live data
182
- [predictions[id_col].isin(live_ids)]
196
+ predictions[
197
+ # filter out ids not in live data
198
+ predictions[id_col].isin(live_ids)
199
+ ]
183
200
  # drop duplicate ids (keep first)
184
201
  .drop_duplicates(subset=id_col, keep="first")
185
202
  # set ids as index
186
- .set_index(id_col).sort_index()
203
+ .set_index(id_col)
204
+ .sort_index()
187
205
  )
188
206
  # rank and fill with 0.5
189
207
  if rank_and_fill:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: numerai-tools
3
- Version: 0.4.3
3
+ Version: 0.5.0.dev1
4
4
  Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
5
5
  Home-page: https://github.com/numerai/numerai-tools
6
6
  Maintainer: Numerai
@@ -1,7 +1,7 @@
1
1
  from setuptools import setup
2
2
  from setuptools import find_packages
3
3
 
4
- VERSION = "0.4.3"
4
+ VERSION = "0.5.0.dev1"
5
5
 
6
6
 
7
7
  def load(path):
@@ -22,6 +22,7 @@ from numerai_tools.scoring import (
22
22
  filter_sort_index,
23
23
  filter_sort_index_many,
24
24
  filter_sort_top_bottom,
25
+ filter_sort_top_bottom_concat,
25
26
  alpha,
26
27
  meta_portfolio_contribution,
27
28
  )
@@ -296,13 +297,12 @@ class TestScoring(unittest.TestCase):
296
297
  top_bottom=None,
297
298
  )
298
299
  np.testing.assert_allclose(
299
- filter_sort_top_bottom(self.up, top_bottom=2),
300
+ filter_sort_top_bottom_concat(self.up, top_bottom=2),
300
301
  [0, 1, 3, 4],
301
302
  )
302
303
  top, bot = filter_sort_top_bottom(
303
304
  self.up,
304
305
  top_bottom=2,
305
- return_concatenated=False,
306
306
  )
307
307
  np.testing.assert_allclose(top, [3, 4])
308
308
  np.testing.assert_allclose(bot, [0, 1])
@@ -0,0 +1,139 @@
1
+ import unittest
2
+
3
+ import numpy as np
4
+ import pandas as pd # type: ignore
5
+
6
+ from numerai_tools.signals import (
7
+ churn,
8
+ turnover,
9
+ calculate_max_churn_and_turnover,
10
+ )
11
+ from .util import (
12
+ generate_fake_universe,
13
+ generate_new_submission,
14
+ )
15
+
16
+
17
+ class TestSignals(unittest.TestCase):
18
+ def setUp(self):
19
+ self.up = pd.Series(list(range(5))).rename("up")
20
+ self.down = pd.Series(list(reversed(range(5)))).rename("down")
21
+ self.up_down = pd.Series([0, 1, 2, 1, 0]).rename("up_down")
22
+ self.oscillate = pd.Series([1, 0, 1, 0, 1]).rename("oscillate")
23
+ self.constant = pd.Series([1, 1, 1, 1, 1]).rename("pos_neg")
24
+
25
+ def test_churn(self):
26
+ assert np.isclose(churn(self.up, self.up), 0)
27
+ assert np.isclose(churn(self.up, self.up_down), 1)
28
+ assert np.isclose(churn(self.up, self.oscillate), 1)
29
+ assert np.isclose(churn(self.up, self.down), 2)
30
+ self.assertRaisesRegex(
31
+ AssertionError,
32
+ "s2 must have non-zero standard deviation",
33
+ churn,
34
+ self.up,
35
+ self.constant,
36
+ )
37
+
38
+ def test_churn_tb(self):
39
+ tmp = churn(self.up, self.up, top_bottom=2)
40
+ assert np.isclose(tmp, 0), tmp
41
+ tmp = churn(self.up, self.up_down, top_bottom=2)
42
+ assert np.isclose(tmp, 0.5), tmp
43
+ tmp = churn(self.up, self.oscillate, top_bottom=2)
44
+ assert np.isclose(tmp, 0.5), tmp
45
+ tmp = churn(self.up, self.down, top_bottom=2)
46
+ assert np.isclose(tmp, 1), tmp
47
+ tmp = churn(self.up, self.constant, top_bottom=2)
48
+ assert np.isclose(tmp, 0), tmp
49
+
50
+ def test_turnover(self):
51
+ assert np.isclose(turnover(self.up, self.up), 0)
52
+ assert np.isclose(turnover(self.up, self.up_down), 3)
53
+ assert np.isclose(turnover(self.up, self.oscillate), 4.5)
54
+ assert np.isclose(turnover(self.up, self.down), 6)
55
+ assert np.isclose(turnover(self.up, self.constant), 3.5)
56
+
57
+ def test_churn_first_submission(self):
58
+ """
59
+ Test that the churn function works for the first submission
60
+ No exceptions should be raised, should return 1
61
+ """
62
+ fake_universe = generate_fake_universe("20130308")
63
+ fake_submission = generate_new_submission(fake_universe)
64
+ fake_neutralizers = pd.DataFrame(
65
+ {
66
+ "neutralizer_1": [0.1] * len(fake_universe),
67
+ "neutralizer_2": [0.2] * len(fake_universe),
68
+ },
69
+ index=fake_universe["numerai_ticker"],
70
+ )
71
+ fake_sample_weights = pd.Series(
72
+ [0.5] * len(fake_universe),
73
+ index=fake_universe["numerai_ticker"],
74
+ name="sample_weight",
75
+ )
76
+ churn, turnover = calculate_max_churn_and_turnover(
77
+ curr_sub=fake_submission,
78
+ curr_neutralizer=fake_neutralizers,
79
+ curr_weight=fake_sample_weights,
80
+ prev_week_subs=[],
81
+ prev_neutralizers={"20240208": fake_neutralizers},
82
+ prev_sample_weights={"20240208": fake_sample_weights},
83
+ universe=fake_universe.set_index("numerai_ticker").sort_index(),
84
+ curr_signal_col="signal",
85
+ curr_ticker_col="numerai_ticker",
86
+ )
87
+ assert np.isclose(churn, 1)
88
+ assert np.isclose(turnover, 1)
89
+
90
+ def test_churn_handles_different_id_columns(self):
91
+ """
92
+ Test that the churn function works when
93
+ previous submission has different id columns.
94
+ """
95
+ fake_universe = generate_fake_universe("20130308")
96
+ fake_submission = generate_new_submission(fake_universe, legacy_headers=True)
97
+ new_fake_universe = generate_fake_universe(
98
+ date_value="20130308", ticker_col="ticker"
99
+ )
100
+ fake_universe["ticker"] = new_fake_universe["ticker"]
101
+ prev_submission = fake_submission.copy()
102
+ fake_neutralizers = pd.DataFrame(
103
+ {
104
+ "neutralizer_1": [0.1] * len(fake_universe),
105
+ "neutralizer_2": [0.2] * len(fake_universe),
106
+ },
107
+ index=fake_universe["numerai_ticker"],
108
+ )
109
+ fake_sample_weights = pd.Series(
110
+ [0.5] * len(fake_universe),
111
+ index=fake_universe["numerai_ticker"],
112
+ name="sample_weight",
113
+ )
114
+ # switch out the numerai_ticker col in-place
115
+ prev_submission["numerai_ticker"] = new_fake_universe["ticker"]
116
+ prev_submission.rename(columns={"numerai_ticker": "ticker"}, inplace=True)
117
+ prev_neutralizers = fake_neutralizers.copy()
118
+ prev_neutralizers.index = new_fake_universe["ticker"]
119
+ prev_neutralizers.index.name = "ticker"
120
+ prev_sample_weights = fake_sample_weights.copy()
121
+ prev_sample_weights.index = new_fake_universe["ticker"]
122
+ prev_sample_weights.index.name = "ticker"
123
+ churn, turnover = calculate_max_churn_and_turnover(
124
+ curr_sub=fake_submission,
125
+ curr_neutralizer=fake_neutralizers,
126
+ curr_weight=fake_sample_weights,
127
+ prev_week_subs={"20240208": prev_submission},
128
+ prev_neutralizers={"20240208": prev_neutralizers},
129
+ prev_sample_weights={"20240208": prev_sample_weights},
130
+ universe=fake_universe.set_index("numerai_ticker").sort_index(),
131
+ curr_signal_col="signal",
132
+ curr_ticker_col="numerai_ticker",
133
+ )
134
+ assert np.isclose(churn, 0)
135
+ assert np.isclose(turnover, 0)
136
+
137
+
138
+ if __name__ == "__main__":
139
+ unittest.main()
@@ -155,6 +155,18 @@ class TestSubmissions(unittest.TestCase):
155
155
  sub[[sub.columns[1]]],
156
156
  )
157
157
 
158
+ def test_validate_headers_signals_data_type_and_date_col(self):
159
+ fake_sub = generate_submission(self.ids, "ticker", "signal")
160
+ fake_sub["data_type"] = "signals"
161
+ fake_sub["friday_date"] = "2023-01-01"
162
+ with self.assertLogs(level="WARNING") as cm:
163
+ assert validate_headers_signals(fake_sub) == ("ticker", "signal")
164
+ self.assertIn(
165
+ "WARNING:numerai_tools.submissions:data_type column found in Signals submission. This is deprecated and will be removed in the future. "
166
+ "Please remove the data_type column from your Signals submission.",
167
+ cm.output[0],
168
+ )
169
+
158
170
  def test_validate_headers_crypto(self):
159
171
  for sub in self.crypto_subs:
160
172
  assert validate_headers_crypto(sub) == tuple(sub.columns)
@@ -432,7 +444,7 @@ class TestSubmissions(unittest.TestCase):
432
444
  assert not cleaned_predictions.index.duplicated().any()
433
445
 
434
446
 
435
- def generate_ids(id_length: int, num_rows: int) -> List[str]:
447
+ def generate_ids(id_length: int, num_rows: int) -> pd.Series:
436
448
  """Generates a given number of unique ascii-valued strings of a given length.
437
449
 
438
450
  Arguments:
@@ -1,72 +0,0 @@
1
- from numerai_tools.scoring import (
2
- filter_sort_index,
3
- filter_sort_top_bottom,
4
- spearman_correlation,
5
- )
6
-
7
- from typing import List, Tuple, Union, Optional
8
-
9
- import pandas as pd
10
-
11
-
12
- def churn(
13
- s1: pd.Series,
14
- s2: pd.Series,
15
- top_bottom: Optional[int] = None,
16
- ) -> float:
17
- """Calculate the churn between two series. Churn is the proportion of elements
18
- that are different between the two series.
19
-
20
- For 2 given series with overlapping indices, churn is 1 - Spearman Correlation.
21
- If top_bottom is provided, the churn is calculated as the average of the % of
22
- tickers that stay in the top and bottom predictions. This is only relevant when
23
- the series are rank signals and not portfolio weights.
24
-
25
- Arguments:
26
- s1: pd.Series - the first series to compare
27
- s2: pd.Series - the second series to compare
28
- top_bottom: Optional[int] - the number of top and bottom predictions to use
29
- when calculating the correlation. Results in
30
- 2*top_bottom predictions.
31
-
32
- Returns:
33
- float - the churn between the two series
34
- """
35
- if top_bottom is not None and top_bottom > 0:
36
- s1_top, s1_bot = filter_sort_top_bottom(s1, top_bottom, False)
37
- s2_top, s2_bot = filter_sort_top_bottom(s2, top_bottom, False)
38
- top_overlap = len(s1_top.index.intersection(s2_top.index)) / top_bottom
39
- bot_overlap = len(s1_bot.index.intersection(s2_bot.index)) / top_bottom
40
- avg_overlap = (top_overlap + bot_overlap) / 2
41
- return 1 - avg_overlap
42
-
43
- s1, s2 = filter_sort_index(s1, s2)
44
- assert s1.std() > 0, "s1 must have non-zero standard deviation"
45
- assert s2.std() > 0, "s2 must have non-zero standard deviation"
46
- return 1 - spearman_correlation(s1, s2)
47
-
48
-
49
- def turnover(
50
- s1: pd.Series,
51
- s2: pd.Series,
52
- ):
53
- """Calculate the turnover between two series. Turnover is the total change in weights between
54
- the two series divided by 2.
55
-
56
- For 2 given series with overlapping indices, join the series on index, fill nans with zeroes
57
- and calculate turnover as the absolute total difference between the two series divided by 2.
58
- This is only relevant when the series are portfolio weights and not rank signals.
59
-
60
- Arguments:
61
- s1: pd.Series - the first series to compare
62
- s2: pd.Series - the second series to compare
63
- top_bottom: Optional[int] - the number of top and bottom predictions to use
64
- when calculating the correlation. Results in
65
- 2*top_bottom predictions.
66
-
67
- Returns:
68
- float - the turnover between the two series
69
- """
70
- s1, s2 = filter_sort_index(s1, s2)
71
- turnover = (s1 - s2).abs().sum() / 2
72
- return turnover
@@ -1,51 +0,0 @@
1
- import unittest
2
-
3
- import numpy as np
4
- import pandas as pd # type: ignore
5
-
6
- from numerai_tools.signals import churn, turnover
7
-
8
-
9
- class TestSignals(unittest.TestCase):
10
- def setUp(self):
11
- self.up = pd.Series(list(range(5))).rename("up")
12
- self.down = pd.Series(list(reversed(range(5)))).rename("down")
13
- self.up_down = pd.Series([0, 1, 2, 1, 0]).rename("up_down")
14
- self.oscillate = pd.Series([1, 0, 1, 0, 1]).rename("oscillate")
15
- self.constant = pd.Series([1, 1, 1, 1, 1]).rename("pos_neg")
16
-
17
- def test_churn(self):
18
- assert np.isclose(churn(self.up, self.up), 0)
19
- assert np.isclose(churn(self.up, self.up_down), 1)
20
- assert np.isclose(churn(self.up, self.oscillate), 1)
21
- assert np.isclose(churn(self.up, self.down), 2)
22
- self.assertRaisesRegex(
23
- AssertionError,
24
- "s2 must have non-zero standard deviation",
25
- churn,
26
- self.up,
27
- self.constant,
28
- )
29
-
30
- def test_churn_tb(self):
31
- tmp = churn(self.up, self.up, top_bottom=2)
32
- assert np.isclose(tmp, 0), tmp
33
- tmp = churn(self.up, self.up_down, top_bottom=2)
34
- assert np.isclose(tmp, 0.5), tmp
35
- tmp = churn(self.up, self.oscillate, top_bottom=2)
36
- assert np.isclose(tmp, 0.5), tmp
37
- tmp = churn(self.up, self.down, top_bottom=2)
38
- assert np.isclose(tmp, 1), tmp
39
- tmp = churn(self.up, self.constant, top_bottom=2)
40
- assert np.isclose(tmp, 0), tmp
41
-
42
- def test_turnover(self):
43
- assert np.isclose(turnover(self.up, self.up), 0)
44
- assert np.isclose(turnover(self.up, self.up_down), 3)
45
- assert np.isclose(turnover(self.up, self.oscillate), 4.5)
46
- assert np.isclose(turnover(self.up, self.down), 6)
47
- assert np.isclose(turnover(self.up, self.constant), 3.5)
48
-
49
-
50
- if __name__ == "__main__":
51
- unittest.main()