numerai-tools 0.5.0.dev3__tar.gz → 0.5.0.dev5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: numerai-tools
3
- Version: 0.5.0.dev3
3
+ Version: 0.5.0.dev5
4
4
  Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
5
5
  License: MIT
6
6
  Author: Numerai Engineering
@@ -7,7 +7,8 @@ from numerai_tools.scoring import (
7
7
  generate_neutralized_weights,
8
8
  )
9
9
  from numerai_tools.submissions import (
10
- clean_submission_signals,
10
+ validate_submission_signals,
11
+ clean_submission,
11
12
  remap_ids,
12
13
  )
13
14
 
@@ -106,11 +107,20 @@ def calculate_max_churn_and_turnover(
106
107
  prev_week_max_turnover -- the maximum turnover from previous submissions
107
108
  """
108
109
  universe = universe.reset_index()
109
- curr_sub_vector = clean_submission_signals(
110
+ (
111
+ curr_ticker_col,
112
+ curr_signal_col,
113
+ curr_sub,
114
+ _,
115
+ ) = validate_submission_signals(
110
116
  universe=universe,
111
117
  submission=curr_sub,
112
- submission_id=curr_signal_col,
113
- index_col=curr_ticker_col,
118
+ )
119
+ curr_sub_vector = clean_submission(
120
+ universe=universe,
121
+ submission=curr_sub,
122
+ src_id_col=curr_ticker_col,
123
+ src_signal_col=curr_signal_col,
114
124
  rank_and_fill=True,
115
125
  )
116
126
  churn_stats = []
@@ -122,11 +132,22 @@ def calculate_max_churn_and_turnover(
122
132
  prev_sub = prev_week_subs[datestamp]
123
133
  prev_neutralizer = prev_neutralizers[datestamp]
124
134
  prev_weight = prev_sample_weights[datestamp]
125
- filtered_prev_sub = clean_submission_signals(
135
+ (
136
+ prev_ticker_col,
137
+ prev_signal_col,
138
+ prev_sub,
139
+ _,
140
+ ) = validate_submission_signals(
141
+ universe=universe,
142
+ submission=prev_sub,
143
+ )
144
+ filtered_prev_sub = clean_submission(
126
145
  universe=universe,
127
146
  submission=prev_sub,
128
- submission_id=curr_signal_col,
129
- index_col=curr_ticker_col,
147
+ src_id_col=prev_ticker_col,
148
+ src_signal_col=prev_signal_col,
149
+ dst_id_col=curr_ticker_col,
150
+ dst_signal_col=curr_signal_col,
130
151
  rank_and_fill=True,
131
152
  )
132
153
  prev_neutralizer = (
@@ -0,0 +1,335 @@
1
+ from numerai_tools.scoring import tie_kept_rank
2
+
3
+ import logging
4
+ from typing import Tuple, List, Optional
5
+
6
+ import pandas as pd
7
+ import numpy as np
8
+
9
# Accepted id / prediction column names, checked by validate_headers_numerai.
NUMERAI_ALLOWED_ID_COLS = ["id"]
NUMERAI_ALLOWED_PRED_COLS = ["prediction", "probability"]

# Accepted id / prediction column names, checked by validate_headers_signals.
SIGNALS_ALLOWED_ID_COLS = [
    "ticker",
    "sedol",
    "bloomberg_ticker",
    "composite_figi",
    "numerai_ticker",
]
SIGNALS_ALLOWED_PRED_COLS = ["prediction", "signal"]
# Optional third column that validate_headers_signals tolerates after the
# id and prediction columns.
SIGNALS_ALLOWED_DATE_COLS = ["friday_date", "date"]
# Minimum number of live ids a submission must cover (validate_ids_signals).
SIGNALS_MIN_TICKERS = 100

# Accepted id / prediction column names, checked by validate_headers_crypto.
CRYPTO_ALLOWED_ID_COLS = ["symbol"]
CRYPTO_ALLOWED_PRED_COLS = ["prediction", "signal"]
# Minimum number of live ids a submission must cover (validate_ids_crypto).
CRYPTO_MIN_TICKERS = 100

# Module-level logger; used for the data_type deprecation warning.
logger = logging.getLogger(__name__)
28
+
29
+
30
+ def _validate_headers(
31
+ submission: pd.DataFrame,
32
+ expected_id_cols: List[str],
33
+ expected_pred_cols: List[str],
34
+ other_cols: Optional[List[str]] = None,
35
+ ) -> Tuple[str, str]:
36
+ """Validate the given submission has the right headers.
37
+ It is recommended to use one of the following functions instead of this one:
38
+ - validate_headers_numerai
39
+ - validate_headers_signals
40
+
41
+ Arguments:
42
+ submission -- pandas DataFrame of the submission
43
+ expected_id_cols -- list of expected id columns
44
+ expected_pred_cols -- list of expected prediction columns
45
+ other_cols -- optional list of other columns that can be present in the submission
46
+
47
+ Return Tuple[str, str]:
48
+ - string name of the id column
49
+ - string name of the prediction column
50
+ """
51
+ expected_headers = [
52
+ [ticker_col, signal_col]
53
+ for ticker_col in expected_id_cols
54
+ for signal_col in expected_pred_cols
55
+ ]
56
+ if other_cols is not None:
57
+ expected_headers += [
58
+ [ticker_col, signal_col, other_col]
59
+ for ticker_col in expected_id_cols
60
+ for signal_col in expected_pred_cols
61
+ for other_col in other_cols
62
+ ]
63
+ columns = submission.columns
64
+ valid_headers = list(columns) in expected_headers
65
+ assert valid_headers, (
66
+ "invalid_submission_headers: headers must be one of"
67
+ f" {expected_id_cols} and one of {expected_pred_cols}"
68
+ )
69
+ return columns[0], columns[1]
70
+
71
+
72
def validate_headers_numerai(submission: pd.DataFrame) -> Tuple[str, str]:
    """Validate headers against NUMERAI_ALLOWED_ID_COLS and
    NUMERAI_ALLOWED_PRED_COLS.

    Arguments:
        submission -- pandas DataFrame of the submission

    Return Tuple[str, str]:
        - string name of the id column
        - string name of the prediction column
    """
    return _validate_headers(
        submission=submission,
        expected_id_cols=NUMERAI_ALLOWED_ID_COLS,
        expected_pred_cols=NUMERAI_ALLOWED_PRED_COLS,
    )
78
+
79
+
80
def validate_headers_signals(submission: pd.DataFrame) -> Tuple[str, str]:
    """Validate headers against SIGNALS_ALLOWED_ID_COLS and
    SIGNALS_ALLOWED_PRED_COLS, optionally followed by one of
    SIGNALS_ALLOWED_DATE_COLS.

    Arguments:
        submission -- pandas DataFrame of the submission

    Return Tuple[str, str]:
        - string name of the id column
        - string name of the prediction column
    """
    return _validate_headers(
        submission=submission,
        expected_id_cols=SIGNALS_ALLOWED_ID_COLS,
        expected_pred_cols=SIGNALS_ALLOWED_PRED_COLS,
        other_cols=SIGNALS_ALLOWED_DATE_COLS,
    )
87
+
88
+
89
def validate_headers_crypto(submission: pd.DataFrame) -> Tuple[str, str]:
    """Validate headers against CRYPTO_ALLOWED_ID_COLS and
    CRYPTO_ALLOWED_PRED_COLS.

    Arguments:
        submission -- pandas DataFrame of the submission

    Return Tuple[str, str]:
        - string name of the id column
        - string name of the prediction column
    """
    return _validate_headers(
        submission=submission,
        expected_id_cols=CRYPTO_ALLOWED_ID_COLS,
        expected_pred_cols=CRYPTO_ALLOWED_PRED_COLS,
    )
95
+
96
+
97
def validate_values(submission: pd.DataFrame, prediction_col: str) -> None:
    """
    Validates the given submission's values contain no NaNs, lie between
    0 and 1, and have a non-zero standard deviation.

    NOTE(review): the range check uses ``Series.between(0, 1)``, which is
    inclusive on both ends, so 0.0 and 1.0 are accepted even though the error
    message says "exclusive". The message is kept unchanged because callers
    may match on it; confirm which bound is intended.

    Arguments:
        submission -- pandas DataFrame of the submission
        prediction_col -- the string name of the prediction column returned by validate_headers

    Raises:
        AssertionError -- if any check fails. Raised explicitly rather than
                          through ``assert`` so the checks still run under
                          ``python -O``.
    """
    predictions = submission[prediction_col]
    if predictions.isna().any():
        raise AssertionError(
            "invalid_submission_values: submission must not contain NaNs"
        )
    if not predictions.between(0, 1).all():
        raise AssertionError(
            "invalid_submission_values: values must be between 0 and 1 exclusive"
        )
    # A constant submission carries no signal; isclose tolerates float noise.
    if np.isclose(0, predictions.std()):
        raise AssertionError(
            "invalid_submission_values: submission must have non-zero standard deviation"
        )
115
+
116
+
117
+ def _validate_ids(
118
+ live_ids: pd.Series, submission: pd.DataFrame, id_col: str, min_tickers: int
119
+ ) -> Tuple[pd.DataFrame, List[str]]:
120
+ """
121
+ Validates the given submission has no NaNs in the given id column
122
+ and that the submission has a minimum number of non-duplicate ids
123
+ after filtering to the live_ids.
124
+
125
+ It is recommended to use one of the following functions instead of this one:
126
+ - validate_ids_numerai
127
+ - validate_ids_signals
128
+
129
+ Arguments:
130
+ live_ids -- pandas Series of the live ids or tickers from live universe
131
+ submission -- pandas DataFrame of the submission
132
+ id_col -- the stringn name of the column containing ids or tickers
133
+
134
+ Return Tuple[pd.DataFrame, List[str]]:
135
+ - submission indexed on id_col and filtered against live_ids
136
+ - set of invalid tickers (diff between indexed sub and live_ids-joined sub)
137
+ """
138
+ assert (
139
+ not submission[id_col].isna().any()
140
+ ), f"invalid_submission_ids: Submission must not contain NaNs in the {id_col} column."
141
+
142
+ index_sub = submission.copy()
143
+ index_sub[id_col] = index_sub[id_col].astype(str)
144
+
145
+ live_ids = live_ids.astype(str)
146
+ live_sub = index_sub[index_sub[id_col].isin(live_ids)].sort_values(id_col)
147
+ assert (
148
+ not live_sub[id_col].duplicated().any()
149
+ ), f"invalid_submission_ids: Duplicates detected in {id_col} for live period."
150
+
151
+ # join on live_ids and ensure min tickers reached
152
+ assert len(live_sub) >= min_tickers, (
153
+ "invalid_submission_ids: Not enough stocks submitted."
154
+ " Are you using the latest live ids or live universe?"
155
+ )
156
+
157
+ invalid_tickers = list(set(index_sub[id_col]).difference(set(live_sub[id_col])))
158
+ return live_sub, invalid_tickers
159
+
160
+
161
def validate_ids_numerai(
    live_ids: pd.Series, submission: pd.DataFrame, id_col: str
) -> Tuple[pd.DataFrame, List[str]]:
    """Validate submission ids, requiring at least len(live_ids) matching rows.

    Arguments:
        live_ids -- pandas Series of the live ids
        submission -- pandas DataFrame of the submission
        id_col -- the string name of the column containing ids

    Return Tuple[pd.DataFrame, List[str]]:
        - the submission filtered to live_ids
        - submitted ids that are not in live_ids
    """
    required_rows = len(live_ids)
    return _validate_ids(live_ids, submission, id_col, required_rows)
165
+
166
+
167
def validate_ids_signals(
    live_ids: pd.Series, submission: pd.DataFrame, id_col: str
) -> Tuple[pd.DataFrame, List[str]]:
    """Validate submission ids, requiring at least SIGNALS_MIN_TICKERS
    matching rows.

    Arguments:
        live_ids -- pandas Series of the live tickers from the live universe
        submission -- pandas DataFrame of the submission
        id_col -- the string name of the column containing tickers

    Return Tuple[pd.DataFrame, List[str]]:
        - the submission filtered to live_ids
        - submitted tickers that are not in live_ids
    """
    required_rows = SIGNALS_MIN_TICKERS
    return _validate_ids(live_ids, submission, id_col, required_rows)
171
+
172
+
173
def validate_ids_crypto(
    live_ids: pd.Series, submission: pd.DataFrame, id_col: str
) -> Tuple[pd.DataFrame, List[str]]:
    """Validate submission ids, requiring at least CRYPTO_MIN_TICKERS
    matching rows.

    Arguments:
        live_ids -- pandas Series of the live symbols from the live universe
        submission -- pandas DataFrame of the submission
        id_col -- the string name of the column containing symbols

    Return Tuple[pd.DataFrame, List[str]]:
        - the submission filtered to live_ids
        - submitted symbols that are not in live_ids
    """
    required_rows = CRYPTO_MIN_TICKERS
    return _validate_ids(live_ids, submission, id_col, required_rows)
177
+
178
+
179
def validate_submission_numerai(
    universe: pd.Series, submission: pd.DataFrame
) -> Tuple[str, str, pd.DataFrame, List[str]]:
    """Run header, id, and value validation on a submission, in that order.

    Arguments:
        universe: pd.Series - the live ids on which the predictions are based
        submission: pd.DataFrame - the predictions to validate

    Returns:
        Tuple[str, str, pd.DataFrame, List[str]] - the validated id column,
            prediction column, filtered submission, and list of invalid ids
    """
    id_col, pred_col = validate_headers_numerai(submission)
    live_sub, unknown_ids = validate_ids_numerai(universe, submission, id_col)
    # Values are checked only on the rows that survived the id filter.
    validate_values(live_sub, pred_col)
    return id_col, pred_col, live_sub, unknown_ids
198
+
199
+
200
def validate_submission_signals(
    universe: pd.DataFrame, submission: pd.DataFrame
) -> Tuple[str, str, pd.DataFrame, List[str]]:
    """Run header, id, and value validation on a submission, in that order.

    A deprecated ``data_type`` column is dropped (with a warning) before the
    headers are checked.

    Arguments:
        universe: pd.DataFrame - the live universe; must contain the
            submission's id column
        submission: pd.DataFrame - the predictions to validate

    Returns:
        Tuple[str, str, pd.DataFrame, List[str]] - the validated ticker column,
            signal column, filtered submission, and list of invalid tickers
    """
    has_data_type = "data_type" in submission.columns
    if has_data_type:
        logger.warning(
            "data_type column found in Signals submission. This is deprecated and support will be removed in the future. "
            "Please remove the data_type column from your Signals submission."
        )
    # Only data_type is dropped here; an allowed date column is left in place
    # and accepted by the header validation.
    submission = submission.drop(columns=["data_type"], errors="ignore")

    id_col, pred_col = validate_headers_signals(submission)
    live_ids = universe[id_col]
    live_sub, unknown_ids = validate_ids_signals(live_ids, submission, id_col)
    validate_values(live_sub, pred_col)
    return id_col, pred_col, live_sub, unknown_ids
226
+
227
+
228
def validate_submission_crypto(
    universe: pd.DataFrame, submission: pd.DataFrame
) -> Tuple[str, str, pd.DataFrame, List[str]]:
    """Validate the headers, ids, and values for a submission.

    Arguments:
        universe: pd.DataFrame - the live universe; must contain the
            submission's id column
        submission: pd.DataFrame - the predictions to validate

    Returns:
        Tuple[str, str, pd.DataFrame, List[str]] - the validated ticker column,
            signal column, filtered submission, and list of invalid tickers
    """
    # The leftover debug print of the whole universe was removed: library
    # code should not write to stdout on every call.
    ticker_col, signal_col = validate_headers_crypto(submission)
    filtered_sub, invalid_tickers = validate_ids_crypto(
        universe[ticker_col], submission, ticker_col
    )
    validate_values(filtered_sub, signal_col)
    return ticker_col, signal_col, filtered_sub, invalid_tickers
248
+
249
+
250
+ def remap_ids(
251
+ data: pd.DataFrame,
252
+ ticker_map: pd.Series | pd.DataFrame,
253
+ src_id_col: str,
254
+ dst_id_col: str,
255
+ ) -> pd.DataFrame:
256
+ """Join the data to the ticker map based on source ids
257
+ and remap to the destination ids. If the ticker is a Series, it is assumed that
258
+ src_id_col and dst_id_col are the same, and the ticker map is simply used to
259
+ ensure the data has all ids in the ticker map.
260
+
261
+ Arguments:
262
+ data: pd.DataFrame - the data to remap
263
+ ticker_map: pd.Series | pd.DataFrame - the mapping of source ids to destination ids
264
+ src_id_col: str - the name of the source ids column in the data
265
+ dst_id_col: str - the name of the destination ids column in the ticker map
266
+ """
267
+ # first, index the universe and data on the source ids
268
+ indexed_map = ticker_map.reset_index().set_index(src_id_col)
269
+ indexed_data = data.set_index(src_id_col)
270
+ return (
271
+ # then, join the universe and data
272
+ indexed_map.join(indexed_data)
273
+ # get just the destination ids and prediction columns
274
+ .reset_index()[[dst_id_col, *indexed_data.columns]]
275
+ # finally, sort by the destination ticker column
276
+ .sort_values(dst_id_col)
277
+ )
278
+
279
+
280
def clean_submission(
    universe: pd.Series | pd.DataFrame,
    submission: pd.DataFrame,
    src_id_col: str,
    src_signal_col: str,
    dst_id_col: Optional[str] = None,
    dst_signal_col: Optional[str] = None,
    rank_and_fill: bool = False,
) -> pd.Series:
    """Prepares your submission for uploading to a Numerai tournament.
    Joins your submission to the universe, remaps ids as needed, drops
    duplicates, sets ids as index, renames the series, then optionally
    tie-kept ranks and fills NaNs with 0.5.

    This function is used in Numerai to clean submissions for use in the
    Meta Model and scoring. We rank and fill submissions before scoring.

    Arguments:
        universe: pd.Series | pd.DataFrame - the live universe of ids on which the predictions are based
        submission: pd.DataFrame - the submission to clean
        src_id_col: str - the name of the ids column
        src_signal_col: str - the name of the predictions column
        dst_id_col: Optional[str] - optional name of the id column to map the ids to
            (defaults to src_id_col)
        dst_signal_col: Optional[str] - optional name of the signal column to rename the submission to
            (defaults to src_signal_col)
        rank_and_fill: bool - whether to call tie_kept_rank and then fill NaNs with 0.5

    Returns:
        pd.Series - the cleaned, properly indexed submission

    Raises:
        AssertionError - if the universe is empty or contains NaNs, or the
            submission is empty. Raised explicitly rather than through
            ``assert`` so the checks still run under ``python -O``.
    """
    if len(universe) == 0:
        raise AssertionError("universe must not be empty")
    universe_has_nans = universe.isna().any()
    if isinstance(universe, pd.DataFrame):
        # DataFrame.any() reduces per column; reduce once more to a scalar.
        universe_has_nans = universe_has_nans.any()
    if universe_has_nans:
        raise AssertionError("universe must not contain NaNs")
    if len(submission) == 0:
        raise AssertionError("predictions must not be empty")

    if dst_id_col is None:
        dst_id_col = src_id_col
    if dst_signal_col is None:
        dst_signal_col = src_signal_col

    clean_preds = (
        remap_ids(submission, universe, src_id_col, dst_id_col)
        # drop NaNs and duplicates
        .dropna(subset=[dst_id_col])
        .drop_duplicates(subset=dst_id_col, keep="first")
        # set ids as index and sort
        .set_index(dst_id_col)
        .sort_index()
        # rename to given name
        .rename(columns={src_signal_col: dst_signal_col})
    )[dst_signal_col]
    # rank and fill with 0.5
    if rank_and_fill:
        clean_preds = tie_kept_rank(clean_preds).fillna(0.5)
    return clean_preds
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "numerai-tools"
3
- version = "0.5.0.dev3"
3
+ version = "0.5.0.dev5"
4
4
  description = "A collection of open-source tools to help interact with Numerai, model data, and automate submissions."
5
5
  authors = [
6
6
  {name = "Numerai Engineering",email = "engineering@numer.ai"}
@@ -1,290 +0,0 @@
1
- from numerai_tools.scoring import tie_kept_rank
2
-
3
- import logging
4
- from typing import Tuple, List
5
-
6
- import pandas as pd
7
- import numpy as np
8
-
9
- NUMERAI_ALLOWED_ID_COLS = ["id"]
10
- NUMERAI_ALLOWED_PRED_COLS = ["prediction", "probability"]
11
-
12
- SIGNALS_ALLOWED_ID_COLS = [
13
- "ticker",
14
- "sedol",
15
- "bloomberg_ticker",
16
- "composite_figi",
17
- "numerai_ticker",
18
- ]
19
- SIGNALS_ALLOWED_PRED_COLS = ["prediction", "signal"]
20
- SIGNALS_ALLOWED_DATE_COLS = ["friday_date", "date"]
21
- SIGNALS_MIN_TICKERS = 100
22
-
23
- CRYPTO_ALLOWED_ID_COLS = ["symbol"]
24
- CRYPTO_ALLOWED_PRED_COLS = ["prediction", "signal"]
25
- CRYPTO_MIN_TICKERS = 100
26
-
27
- logger = logging.getLogger(__name__)
28
-
29
-
30
- def _validate_headers(
31
- expected_id_cols: List[str], expected_pred_cols: List[str], submission: pd.DataFrame
32
- ) -> Tuple[str, str]:
33
- """Validate the given submission has the right headers.
34
- It is recommended to use one of the following functions instead of this one:
35
- - validate_headers_numerai
36
- - validate_headers_signals
37
-
38
- Arguments:
39
- submission -- pandas DataFrame of the submission
40
-
41
- Return Tuple[str, str]:
42
- - string name of the id column
43
- - string name of the prediction column
44
- """
45
- expected_headers = [
46
- [ticker_col, signal_col]
47
- for ticker_col in expected_id_cols
48
- for signal_col in expected_pred_cols
49
- ]
50
- columns = submission.columns
51
- valid_headers = list(columns) in expected_headers
52
- assert (
53
- valid_headers
54
- ), f"headers must be one of {expected_id_cols} and one of {expected_pred_cols}"
55
- return columns[0], columns[1]
56
-
57
-
58
- def validate_headers_numerai(submission: pd.DataFrame) -> Tuple[str, str]:
59
- return _validate_headers(
60
- NUMERAI_ALLOWED_ID_COLS, NUMERAI_ALLOWED_PRED_COLS, submission
61
- )
62
-
63
-
64
- def validate_headers_signals(submission: pd.DataFrame) -> Tuple[str, str]:
65
- return _validate_headers(
66
- SIGNALS_ALLOWED_ID_COLS, SIGNALS_ALLOWED_PRED_COLS, submission
67
- )
68
-
69
-
70
- def validate_headers_crypto(submission: pd.DataFrame) -> Tuple[str, str]:
71
- return _validate_headers(
72
- CRYPTO_ALLOWED_ID_COLS, CRYPTO_ALLOWED_PRED_COLS, submission
73
- )
74
-
75
-
76
- def validate_values(submission: pd.DataFrame, prediction_col: str) -> None:
77
- """
78
- Validates the given submission's values are between 0 and 1 exclusive and
79
- that the submission have a non-zero standard deviation.
80
-
81
- Arguments:
82
- submission -- pandas DataFrame of the submission
83
- prediction_col -- the string name of the prediction column returned by validate_headers
84
- """
85
- assert (
86
- submission[prediction_col].isna().sum() == 0
87
- ), "submission must not contain NaNs"
88
- assert (
89
- submission[prediction_col].between(0, 1).all()
90
- ), "values must be between 0 and 1 exclusive"
91
- assert not np.isclose(
92
- 0, submission[prediction_col].std()
93
- ), "submission must have non-zero standard deviation"
94
-
95
-
96
- def _validate_ids(
97
- live_ids: pd.Series, submission: pd.DataFrame, id_col: str, min_tickers: int
98
- ) -> Tuple[pd.DataFrame, List[str]]:
99
- """
100
- Validates the given submission has no NaNs in the given id column
101
- and that the submission has a minimum number of non-duplicate ids
102
- after filtering to the live_ids.
103
-
104
- It is recommended to use one of the following functions instead of this one:
105
- - validate_ids_numerai
106
- - validate_ids_signals
107
-
108
- Arguments:
109
- live_ids -- pandas Series of the live ids or tickers from live universe
110
- submission -- pandas DataFrame of the submission
111
- id_col -- the stringn name of the column containing ids or tickers
112
-
113
- Return Tuple[pd.DataFrame, List[str]]:
114
- - submission indexed on id_col and filtered against live_ids
115
- - set of invalid tickers (diff between indexed sub and live_ids-joined sub)
116
- """
117
- assert (
118
- not submission[id_col].isna().any()
119
- ), f"Submission must not contain NaNs in the {id_col} column."
120
-
121
- index_sub = submission.copy()
122
- index_sub[id_col] = index_sub[id_col].astype(str)
123
-
124
- live_ids = live_ids.astype(str)
125
- live_sub = index_sub[index_sub[id_col].isin(live_ids)].sort_values(id_col)
126
- assert (
127
- not live_sub[id_col].duplicated().any()
128
- ), f"Duplicates detected in {id_col} for live period."
129
-
130
- # join on live_ids and ensure min tickers reached
131
- assert (
132
- len(live_sub) >= min_tickers
133
- ), "Not enough stocks submitted. Are you using the latest live ids or live universe?"
134
-
135
- invalid_tickers = list(set(index_sub[id_col]).difference(set(live_sub[id_col])))
136
- return live_sub, invalid_tickers
137
-
138
-
139
- def validate_ids_numerai(
140
- live_ids: pd.Series, submission: pd.DataFrame, id_col: str
141
- ) -> Tuple[pd.DataFrame, List[str]]:
142
- return _validate_ids(live_ids, submission, id_col, len(live_ids))
143
-
144
-
145
- def validate_ids_signals(
146
- live_ids: pd.Series, submission: pd.DataFrame, id_col: str
147
- ) -> Tuple[pd.DataFrame, List[str]]:
148
- return _validate_ids(live_ids, submission, id_col, SIGNALS_MIN_TICKERS)
149
-
150
-
151
- def validate_ids_crypto(
152
- live_ids: pd.Series, submission: pd.DataFrame, id_col: str
153
- ) -> Tuple[pd.DataFrame, List[str]]:
154
- return _validate_ids(live_ids, submission, id_col, CRYPTO_MIN_TICKERS)
155
-
156
-
157
- def remap_ids(
158
- data: pd.DataFrame,
159
- ticker_map: pd.Series | pd.DataFrame,
160
- src_id_col: str,
161
- dst_id_col: str,
162
- ) -> pd.DataFrame:
163
- # first, index the universe and data on the source ids
164
- indexed_map = ticker_map.reset_index().set_index(src_id_col)
165
- indexed_data = data.set_index(src_id_col)
166
- return (
167
- # then, join the universe and data
168
- indexed_map.join(indexed_data)
169
- # get just the destination ids and prediction columns
170
- .reset_index()[[dst_id_col, *indexed_data.columns]]
171
- # finally, sort by the destination ticker column
172
- .sort_values(dst_id_col)
173
- )
174
-
175
-
176
- def clean_submission(
177
- live_ids: pd.Series | pd.DataFrame,
178
- predictions: pd.DataFrame,
179
- name: str,
180
- id_col: str,
181
- rank_and_fill: bool,
182
- tournament: int,
183
- ) -> pd.Series:
184
- """Prepare predictions for submission to Numerai.
185
- Filters out ids not in live data, drops duplicates, sets ids as index,
186
- then optionally ranks (keeping ties) and fills NaNs with 0.5.
187
-
188
- This function is used in Numerai to clean submissions for use in the
189
- Meta Model and scoring. We only rank and fill in preparation for scoring
190
- Signals and Crypto submissions.
191
-
192
- Arguments:
193
- live_ids: pd.Series - the ids in the live data
194
- predictions: pd.DataFrame - the predictions to clean
195
- name: str - the name of the submission (used for renaming)
196
- id_col: str - the column name of the ids
197
- rank_and_fill: bool - whether to rank and fill NaNs with 0.5
198
- left_join_ids: bool - whether to left join the predictions onto the ids
199
- """
200
- assert len(live_ids) > 0, "live_ids must not be empty"
201
- if isinstance(live_ids, pd.DataFrame):
202
- assert live_ids.isna().sum().sum() == 0, "live_ids must not contain NaNs"
203
- else:
204
- assert live_ids.isna().sum() == 0, "live_ids must not contain NaNs"
205
- assert len(predictions) > 0, "predictions must not be empty"
206
-
207
- header_fn = {
208
- 8: validate_headers_numerai,
209
- 11: validate_headers_signals,
210
- 12: validate_headers_crypto,
211
- }
212
- assert (
213
- tournament in header_fn
214
- ), f"Unsupported tournament {tournament} for cleaning predictions"
215
- ticker_col, signal_col = header_fn[tournament](predictions)
216
-
217
- clean_preds = (
218
- remap_ids(predictions, live_ids, ticker_col, id_col)
219
- # drop NaNs and duplicates
220
- .dropna(subset=[id_col])
221
- .drop_duplicates(subset=id_col, keep="first")
222
- # set ids as index and sort
223
- .set_index(id_col)
224
- .sort_index()
225
- # rename to given name
226
- .rename(columns={signal_col: name})
227
- )[name]
228
- # rank and fill with 0.5
229
- if rank_and_fill:
230
- clean_preds = tie_kept_rank(clean_preds).fillna(0.5)
231
- return clean_preds
232
-
233
-
234
- def clean_submission_numerai(
235
- live_ids: pd.Series, submission: pd.DataFrame, user_id: str
236
- ) -> pd.Series:
237
- return clean_submission(
238
- live_ids=live_ids,
239
- predictions=submission,
240
- name=user_id,
241
- id_col="id",
242
- rank_and_fill=True,
243
- tournament=8,
244
- )
245
-
246
-
247
- def clean_submission_signals(
248
- universe: pd.DataFrame,
249
- submission: pd.DataFrame,
250
- submission_id: str,
251
- index_col: str,
252
- rank_and_fill: bool = True,
253
- ) -> pd.Series:
254
- # drop data_type and date columns if they exist
255
- if "data_type" in submission.columns:
256
- logger.warning(
257
- "data_type column found in Signals submission. This is deprecated and support will be removed in the future. "
258
- "Please remove the data_type column from your Signals submission."
259
- )
260
- date_col = [
261
- date_col
262
- for date_col in SIGNALS_ALLOWED_DATE_COLS
263
- if date_col in list(submission.columns)
264
- ]
265
- submission = submission.drop(columns=["data_type", *date_col], errors="ignore")
266
- return clean_submission(
267
- live_ids=universe,
268
- predictions=submission,
269
- name=submission_id,
270
- id_col=index_col,
271
- rank_and_fill=rank_and_fill,
272
- tournament=11,
273
- )
274
-
275
-
276
- def clean_submission_crypto(
277
- universe: pd.DataFrame,
278
- submission: pd.DataFrame,
279
- submission_id: str,
280
- index_col: str,
281
- rank_and_fill: bool = True,
282
- ):
283
- return clean_submission(
284
- live_ids=universe,
285
- predictions=submission,
286
- name=submission_id,
287
- id_col=index_col,
288
- rank_and_fill=rank_and_fill,
289
- tournament=12,
290
- )