numerai-tools 0.5.0.dev3__tar.gz → 0.5.0.dev5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {numerai_tools-0.5.0.dev3 → numerai_tools-0.5.0.dev5}/PKG-INFO +1 -1
- {numerai_tools-0.5.0.dev3 → numerai_tools-0.5.0.dev5}/numerai_tools/signals.py +28 -7
- numerai_tools-0.5.0.dev5/numerai_tools/submissions.py +335 -0
- {numerai_tools-0.5.0.dev3 → numerai_tools-0.5.0.dev5}/pyproject.toml +1 -1
- numerai_tools-0.5.0.dev3/numerai_tools/submissions.py +0 -290
- {numerai_tools-0.5.0.dev3 → numerai_tools-0.5.0.dev5}/LICENSE +0 -0
- {numerai_tools-0.5.0.dev3 → numerai_tools-0.5.0.dev5}/README.md +0 -0
- {numerai_tools-0.5.0.dev3 → numerai_tools-0.5.0.dev5}/numerai_tools/__init__.py +0 -0
- {numerai_tools-0.5.0.dev3 → numerai_tools-0.5.0.dev5}/numerai_tools/py.typed +0 -0
- {numerai_tools-0.5.0.dev3 → numerai_tools-0.5.0.dev5}/numerai_tools/scoring.py +0 -0
|
@@ -7,7 +7,8 @@ from numerai_tools.scoring import (
|
|
|
7
7
|
generate_neutralized_weights,
|
|
8
8
|
)
|
|
9
9
|
from numerai_tools.submissions import (
|
|
10
|
-
|
|
10
|
+
validate_submission_signals,
|
|
11
|
+
clean_submission,
|
|
11
12
|
remap_ids,
|
|
12
13
|
)
|
|
13
14
|
|
|
@@ -106,11 +107,20 @@ def calculate_max_churn_and_turnover(
|
|
|
106
107
|
prev_week_max_turnover -- the maximum turnover from previous submissions
|
|
107
108
|
"""
|
|
108
109
|
universe = universe.reset_index()
|
|
109
|
-
|
|
110
|
+
(
|
|
111
|
+
curr_ticker_col,
|
|
112
|
+
curr_signal_col,
|
|
113
|
+
curr_sub,
|
|
114
|
+
_,
|
|
115
|
+
) = validate_submission_signals(
|
|
110
116
|
universe=universe,
|
|
111
117
|
submission=curr_sub,
|
|
112
|
-
|
|
113
|
-
|
|
118
|
+
)
|
|
119
|
+
curr_sub_vector = clean_submission(
|
|
120
|
+
universe=universe,
|
|
121
|
+
submission=curr_sub,
|
|
122
|
+
src_id_col=curr_ticker_col,
|
|
123
|
+
src_signal_col=curr_signal_col,
|
|
114
124
|
rank_and_fill=True,
|
|
115
125
|
)
|
|
116
126
|
churn_stats = []
|
|
@@ -122,11 +132,22 @@ def calculate_max_churn_and_turnover(
|
|
|
122
132
|
prev_sub = prev_week_subs[datestamp]
|
|
123
133
|
prev_neutralizer = prev_neutralizers[datestamp]
|
|
124
134
|
prev_weight = prev_sample_weights[datestamp]
|
|
125
|
-
|
|
135
|
+
(
|
|
136
|
+
prev_ticker_col,
|
|
137
|
+
prev_signal_col,
|
|
138
|
+
prev_sub,
|
|
139
|
+
_,
|
|
140
|
+
) = validate_submission_signals(
|
|
141
|
+
universe=universe,
|
|
142
|
+
submission=prev_sub,
|
|
143
|
+
)
|
|
144
|
+
filtered_prev_sub = clean_submission(
|
|
126
145
|
universe=universe,
|
|
127
146
|
submission=prev_sub,
|
|
128
|
-
|
|
129
|
-
|
|
147
|
+
src_id_col=prev_ticker_col,
|
|
148
|
+
src_signal_col=prev_signal_col,
|
|
149
|
+
dst_id_col=curr_ticker_col,
|
|
150
|
+
dst_signal_col=curr_signal_col,
|
|
130
151
|
rank_and_fill=True,
|
|
131
152
|
)
|
|
132
153
|
prev_neutralizer = (
|
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
from numerai_tools.scoring import tie_kept_rank
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Tuple, List, Optional
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
# Header names accepted for Numerai submissions; passed to _validate_headers
# by validate_headers_numerai as the id and prediction column choices.
NUMERAI_ALLOWED_ID_COLS = ["id"]
NUMERAI_ALLOWED_PRED_COLS = ["prediction", "probability"]

# Header names accepted for Signals submissions; passed to _validate_headers
# by validate_headers_signals.
SIGNALS_ALLOWED_ID_COLS = [
    "ticker",
    "sedol",
    "bloomberg_ticker",
    "composite_figi",
    "numerai_ticker",
]
SIGNALS_ALLOWED_PRED_COLS = ["prediction", "signal"]
# Optional third column a Signals submission may carry after id and prediction.
SIGNALS_ALLOWED_DATE_COLS = ["friday_date", "date"]
# Minimum number of submitted ids that must match the live universe
# (used as min_tickers by validate_ids_signals).
SIGNALS_MIN_TICKERS = 100

# Header names accepted for Crypto submissions; passed to _validate_headers
# by validate_headers_crypto.
CRYPTO_ALLOWED_ID_COLS = ["symbol"]
CRYPTO_ALLOWED_PRED_COLS = ["prediction", "signal"]
# Minimum number of submitted ids that must match the live universe
# (used as min_tickers by validate_ids_crypto).
CRYPTO_MIN_TICKERS = 100

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _validate_headers(
|
|
31
|
+
submission: pd.DataFrame,
|
|
32
|
+
expected_id_cols: List[str],
|
|
33
|
+
expected_pred_cols: List[str],
|
|
34
|
+
other_cols: Optional[List[str]] = None,
|
|
35
|
+
) -> Tuple[str, str]:
|
|
36
|
+
"""Validate the given submission has the right headers.
|
|
37
|
+
It is recommended to use one of the following functions instead of this one:
|
|
38
|
+
- validate_headers_numerai
|
|
39
|
+
- validate_headers_signals
|
|
40
|
+
|
|
41
|
+
Arguments:
|
|
42
|
+
submission -- pandas DataFrame of the submission
|
|
43
|
+
expected_id_cols -- list of expected id columns
|
|
44
|
+
expected_pred_cols -- list of expected prediction columns
|
|
45
|
+
other_cols -- optional list of other columns that can be present in the submission
|
|
46
|
+
|
|
47
|
+
Return Tuple[str, str]:
|
|
48
|
+
- string name of the id column
|
|
49
|
+
- string name of the prediction column
|
|
50
|
+
"""
|
|
51
|
+
expected_headers = [
|
|
52
|
+
[ticker_col, signal_col]
|
|
53
|
+
for ticker_col in expected_id_cols
|
|
54
|
+
for signal_col in expected_pred_cols
|
|
55
|
+
]
|
|
56
|
+
if other_cols is not None:
|
|
57
|
+
expected_headers += [
|
|
58
|
+
[ticker_col, signal_col, other_col]
|
|
59
|
+
for ticker_col in expected_id_cols
|
|
60
|
+
for signal_col in expected_pred_cols
|
|
61
|
+
for other_col in other_cols
|
|
62
|
+
]
|
|
63
|
+
columns = submission.columns
|
|
64
|
+
valid_headers = list(columns) in expected_headers
|
|
65
|
+
assert valid_headers, (
|
|
66
|
+
"invalid_submission_headers: headers must be one of"
|
|
67
|
+
f" {expected_id_cols} and one of {expected_pred_cols}"
|
|
68
|
+
)
|
|
69
|
+
return columns[0], columns[1]
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def validate_headers_numerai(submission: pd.DataFrame) -> Tuple[str, str]:
    """Validate submission headers against the Numerai allowed columns.

    Returns the (id column, prediction column) names found by
    _validate_headers.
    """
    return _validate_headers(
        submission=submission,
        expected_id_cols=NUMERAI_ALLOWED_ID_COLS,
        expected_pred_cols=NUMERAI_ALLOWED_PRED_COLS,
    )
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def validate_headers_signals(submission: pd.DataFrame) -> Tuple[str, str]:
    """Validate submission headers against the Signals allowed columns.

    A third column from SIGNALS_ALLOWED_DATE_COLS is also accepted.
    Returns the (id column, prediction column) names found by
    _validate_headers.
    """
    return _validate_headers(
        submission=submission,
        expected_id_cols=SIGNALS_ALLOWED_ID_COLS,
        expected_pred_cols=SIGNALS_ALLOWED_PRED_COLS,
        other_cols=SIGNALS_ALLOWED_DATE_COLS,
    )
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def validate_headers_crypto(submission: pd.DataFrame) -> Tuple[str, str]:
    """Validate submission headers against the Crypto allowed columns.

    Returns the (id column, prediction column) names found by
    _validate_headers.
    """
    return _validate_headers(
        submission=submission,
        expected_id_cols=CRYPTO_ALLOWED_ID_COLS,
        expected_pred_cols=CRYPTO_ALLOWED_PRED_COLS,
    )
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def validate_values(submission: pd.DataFrame, prediction_col: str) -> None:
    """
    Validates the given submission's values contain no NaNs, lie between
    0 and 1, and have a non-zero standard deviation.

    NOTE(review): the range check uses pandas' default inclusive bounds, so
    exactly 0 and exactly 1 are accepted even though the error message says
    "exclusive" -- confirm which is intended before tightening it.

    Arguments:
        submission -- pandas DataFrame of the submission
        prediction_col -- the string name of the prediction column returned by validate_headers

    Raises:
        AssertionError -- if any of the checks fails
    """
    predictions = submission[prediction_col]

    # Checks raise explicitly (instead of via `assert`) so they still run when
    # Python is started with -O; the exception type is unchanged for callers.
    if predictions.isna().any():
        raise AssertionError(
            "invalid_submission_values: submission must not contain NaNs"
        )
    if not predictions.between(0, 1).all():
        raise AssertionError(
            "invalid_submission_values: values must be between 0 and 1 exclusive"
        )
    if np.isclose(0, predictions.std()):
        raise AssertionError(
            "invalid_submission_values: submission must have non-zero standard deviation"
        )
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _validate_ids(
|
|
118
|
+
live_ids: pd.Series, submission: pd.DataFrame, id_col: str, min_tickers: int
|
|
119
|
+
) -> Tuple[pd.DataFrame, List[str]]:
|
|
120
|
+
"""
|
|
121
|
+
Validates the given submission has no NaNs in the given id column
|
|
122
|
+
and that the submission has a minimum number of non-duplicate ids
|
|
123
|
+
after filtering to the live_ids.
|
|
124
|
+
|
|
125
|
+
It is recommended to use one of the following functions instead of this one:
|
|
126
|
+
- validate_ids_numerai
|
|
127
|
+
- validate_ids_signals
|
|
128
|
+
|
|
129
|
+
Arguments:
|
|
130
|
+
live_ids -- pandas Series of the live ids or tickers from live universe
|
|
131
|
+
submission -- pandas DataFrame of the submission
|
|
132
|
+
id_col -- the stringn name of the column containing ids or tickers
|
|
133
|
+
|
|
134
|
+
Return Tuple[pd.DataFrame, List[str]]:
|
|
135
|
+
- submission indexed on id_col and filtered against live_ids
|
|
136
|
+
- set of invalid tickers (diff between indexed sub and live_ids-joined sub)
|
|
137
|
+
"""
|
|
138
|
+
assert (
|
|
139
|
+
not submission[id_col].isna().any()
|
|
140
|
+
), f"invalid_submission_ids: Submission must not contain NaNs in the {id_col} column."
|
|
141
|
+
|
|
142
|
+
index_sub = submission.copy()
|
|
143
|
+
index_sub[id_col] = index_sub[id_col].astype(str)
|
|
144
|
+
|
|
145
|
+
live_ids = live_ids.astype(str)
|
|
146
|
+
live_sub = index_sub[index_sub[id_col].isin(live_ids)].sort_values(id_col)
|
|
147
|
+
assert (
|
|
148
|
+
not live_sub[id_col].duplicated().any()
|
|
149
|
+
), f"invalid_submission_ids: Duplicates detected in {id_col} for live period."
|
|
150
|
+
|
|
151
|
+
# join on live_ids and ensure min tickers reached
|
|
152
|
+
assert len(live_sub) >= min_tickers, (
|
|
153
|
+
"invalid_submission_ids: Not enough stocks submitted."
|
|
154
|
+
" Are you using the latest live ids or live universe?"
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
invalid_tickers = list(set(index_sub[id_col]).difference(set(live_sub[id_col])))
|
|
158
|
+
return live_sub, invalid_tickers
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def validate_ids_numerai(
    live_ids: pd.Series, submission: pd.DataFrame, id_col: str
) -> Tuple[pd.DataFrame, List[str]]:
    """Validate Numerai submission ids via _validate_ids.

    The minimum number of matching ids is len(live_ids).
    """
    required = len(live_ids)
    return _validate_ids(
        live_ids=live_ids,
        submission=submission,
        id_col=id_col,
        min_tickers=required,
    )
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def validate_ids_signals(
    live_ids: pd.Series, submission: pd.DataFrame, id_col: str
) -> Tuple[pd.DataFrame, List[str]]:
    """Validate Signals submission ids via _validate_ids.

    The minimum number of matching ids is SIGNALS_MIN_TICKERS.
    """
    return _validate_ids(
        live_ids=live_ids,
        submission=submission,
        id_col=id_col,
        min_tickers=SIGNALS_MIN_TICKERS,
    )
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def validate_ids_crypto(
    live_ids: pd.Series, submission: pd.DataFrame, id_col: str
) -> Tuple[pd.DataFrame, List[str]]:
    """Validate Crypto submission ids via _validate_ids.

    The minimum number of matching ids is CRYPTO_MIN_TICKERS.
    """
    return _validate_ids(
        live_ids=live_ids,
        submission=submission,
        id_col=id_col,
        min_tickers=CRYPTO_MIN_TICKERS,
    )
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def validate_submission_numerai(
    universe: pd.Series, submission: pd.DataFrame
) -> Tuple[str, str, pd.DataFrame, List[str]]:
    """Run header, id, and value validation on a Numerai submission.

    Arguments:
        universe: pd.Series - the live ids on which the predictions are based
        submission: pd.DataFrame - the predictions to validate

    Returns:
        Tuple[str, str, pd.DataFrame, List[str]] - the validated id column,
        prediction column, submission filtered to the live ids, and list of
        submitted ids not found in the universe
    """
    # 1. headers decide which columns hold ids and predictions
    id_col, pred_col = validate_headers_numerai(submission)
    # 2. ids are checked against the live universe
    live_sub, unknown_ids = validate_ids_numerai(universe, submission, id_col)
    # 3. values are checked only on the rows that survived the id filter
    validate_values(live_sub, pred_col)
    return id_col, pred_col, live_sub, unknown_ids
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def validate_submission_signals(
    universe: pd.DataFrame, submission: pd.DataFrame
) -> Tuple[str, str, pd.DataFrame, List[str]]:
    """Run header, id, and value validation on a Signals submission.

    A deprecated data_type column, if present, triggers a warning and is
    dropped before the headers are checked.

    Arguments:
        universe: pd.DataFrame - the live universe; must contain a column named
            like the submission's id column
        submission: pd.DataFrame - the predictions to validate

    Returns:
        Tuple[str, str, pd.DataFrame, List[str]] - the validated ticker column,
        signal column, submission filtered to the live universe, and list of
        submitted tickers not found in the universe
    """
    has_data_type = "data_type" in submission.columns
    if has_data_type:
        logger.warning(
            "data_type column found in Signals submission. This is deprecated and support will be removed in the future. "
            "Please remove the data_type column from your Signals submission."
        )
    # errors="ignore" makes this a no-op when the column is absent
    submission = submission.drop(columns=["data_type"], errors="ignore")

    id_col, pred_col = validate_headers_signals(submission)
    live_tickers = universe[id_col]
    live_sub, unknown_tickers = validate_ids_signals(live_tickers, submission, id_col)
    validate_values(live_sub, pred_col)
    return id_col, pred_col, live_sub, unknown_tickers
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def validate_submission_crypto(
    universe: pd.DataFrame, submission: pd.DataFrame
) -> Tuple[str, str, pd.DataFrame, List[str]]:
    """Validate the headers, ids, and values for a Crypto submission.

    Arguments:
        universe: pd.DataFrame - the live universe of ids on which the predictions
            are based; must contain a column named like the submission's id column
        submission: pd.DataFrame - the predictions to validate

    Returns:
        Tuple[str, str, pd.DataFrame, List[str]] - the validated ticker column, signal column,
        filtered submission, and list of invalid tickers

    Raises:
        AssertionError - propagated from the header, id, and value validators
    """
    # The stray debug print of the whole universe was removed: a validation
    # helper should not write to stdout.
    ticker_col, signal_col = validate_headers_crypto(submission)
    filtered_sub, invalid_tickers = validate_ids_crypto(
        universe[ticker_col], submission, ticker_col
    )
    # Values are checked only on rows that matched the live universe.
    validate_values(filtered_sub, signal_col)
    return ticker_col, signal_col, filtered_sub, invalid_tickers
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def remap_ids(
|
|
251
|
+
data: pd.DataFrame,
|
|
252
|
+
ticker_map: pd.Series | pd.DataFrame,
|
|
253
|
+
src_id_col: str,
|
|
254
|
+
dst_id_col: str,
|
|
255
|
+
) -> pd.DataFrame:
|
|
256
|
+
"""Join the data to the ticker map based on source ids
|
|
257
|
+
and remap to the destination ids. If the ticker is a Series, it is assumed that
|
|
258
|
+
src_id_col and dst_id_col are the same, and the ticker map is simply used to
|
|
259
|
+
ensure the data has all ids in the ticker map.
|
|
260
|
+
|
|
261
|
+
Arguments:
|
|
262
|
+
data: pd.DataFrame - the data to remap
|
|
263
|
+
ticker_map: pd.Series | pd.DataFrame - the mapping of source ids to destination ids
|
|
264
|
+
src_id_col: str - the name of the source ids column in the data
|
|
265
|
+
dst_id_col: str - the name of the destination ids column in the ticker map
|
|
266
|
+
"""
|
|
267
|
+
# first, index the universe and data on the source ids
|
|
268
|
+
indexed_map = ticker_map.reset_index().set_index(src_id_col)
|
|
269
|
+
indexed_data = data.set_index(src_id_col)
|
|
270
|
+
return (
|
|
271
|
+
# then, join the universe and data
|
|
272
|
+
indexed_map.join(indexed_data)
|
|
273
|
+
# get just the destination ids and prediction columns
|
|
274
|
+
.reset_index()[[dst_id_col, *indexed_data.columns]]
|
|
275
|
+
# finally, sort by the destination ticker column
|
|
276
|
+
.sort_values(dst_id_col)
|
|
277
|
+
)
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def clean_submission(
|
|
281
|
+
universe: pd.Series | pd.DataFrame,
|
|
282
|
+
submission: pd.DataFrame,
|
|
283
|
+
src_id_col: str,
|
|
284
|
+
src_signal_col: str,
|
|
285
|
+
dst_id_col: Optional[str] = None,
|
|
286
|
+
dst_signal_col: Optional[str] = None,
|
|
287
|
+
rank_and_fill: bool = False,
|
|
288
|
+
) -> pd.Series:
|
|
289
|
+
"""Prepares your submission for uploading to a Numerai tournament.
|
|
290
|
+
Joins your submission to the universe, remaps ids as neded, drops
|
|
291
|
+
duplicates, sets ids as index, renames the series, then optionally
|
|
292
|
+
tie-kept ranks and fills NaNs with 0.5.
|
|
293
|
+
|
|
294
|
+
This function is used in Numerai to clean submissions for use in the
|
|
295
|
+
Meta Model and scoring. We rank and fill submissions before scoring.
|
|
296
|
+
|
|
297
|
+
Arguments:
|
|
298
|
+
universe: pd.Series - the live universe of ids on which the predictions are based
|
|
299
|
+
submission: pd.DataFrame - the submission to clean
|
|
300
|
+
src_id_col: str - the name of the ids column
|
|
301
|
+
src_signal_col: str - the name of the predictions column
|
|
302
|
+
dst_id_col: Optional[str] - optional name of the id column to map the ids to
|
|
303
|
+
dst_signal_col: Optional[str] - optional name of the signal column to rename the submission to
|
|
304
|
+
rank_and_fill: bool - whether to call tie_kept_rank and then fill NaNs with 0.5
|
|
305
|
+
|
|
306
|
+
Returns:
|
|
307
|
+
pd.Series - the cleaned, properly indexed submission
|
|
308
|
+
"""
|
|
309
|
+
assert len(universe) > 0, "universe must not be empty"
|
|
310
|
+
if isinstance(universe, pd.DataFrame):
|
|
311
|
+
assert universe.isna().sum().sum() == 0, "universe must not contain NaNs"
|
|
312
|
+
else:
|
|
313
|
+
assert universe.isna().sum() == 0, "universe must not contain NaNs"
|
|
314
|
+
assert len(submission) > 0, "predictions must not be empty"
|
|
315
|
+
|
|
316
|
+
if dst_id_col is None:
|
|
317
|
+
dst_id_col = src_id_col
|
|
318
|
+
if dst_signal_col is None:
|
|
319
|
+
dst_signal_col = src_signal_col
|
|
320
|
+
|
|
321
|
+
clean_preds = (
|
|
322
|
+
remap_ids(submission, universe, src_id_col, dst_id_col)
|
|
323
|
+
# drop NaNs and duplicates
|
|
324
|
+
.dropna(subset=[dst_id_col])
|
|
325
|
+
.drop_duplicates(subset=dst_id_col, keep="first")
|
|
326
|
+
# set ids as index and sort
|
|
327
|
+
.set_index(dst_id_col)
|
|
328
|
+
.sort_index()
|
|
329
|
+
# rename to given name
|
|
330
|
+
.rename(columns={src_signal_col: dst_signal_col})
|
|
331
|
+
)[dst_signal_col]
|
|
332
|
+
# rank and fill with 0.5
|
|
333
|
+
if rank_and_fill:
|
|
334
|
+
clean_preds = tie_kept_rank(clean_preds).fillna(0.5)
|
|
335
|
+
return clean_preds
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "numerai-tools"
|
|
3
|
-
version = "0.5.0.dev3"
|
|
3
|
+
version = "0.5.0.dev5"
|
|
4
4
|
description = "A collection of open-source tools to help interact with Numerai, model data, and automate submissions."
|
|
5
5
|
authors = [
|
|
6
6
|
{name = "Numerai Engineering",email = "engineering@numer.ai"}
|
|
@@ -1,290 +0,0 @@
|
|
|
1
|
-
from numerai_tools.scoring import tie_kept_rank
|
|
2
|
-
|
|
3
|
-
import logging
|
|
4
|
-
from typing import Tuple, List
|
|
5
|
-
|
|
6
|
-
import pandas as pd
|
|
7
|
-
import numpy as np
|
|
8
|
-
|
|
9
|
-
NUMERAI_ALLOWED_ID_COLS = ["id"]
|
|
10
|
-
NUMERAI_ALLOWED_PRED_COLS = ["prediction", "probability"]
|
|
11
|
-
|
|
12
|
-
SIGNALS_ALLOWED_ID_COLS = [
|
|
13
|
-
"ticker",
|
|
14
|
-
"sedol",
|
|
15
|
-
"bloomberg_ticker",
|
|
16
|
-
"composite_figi",
|
|
17
|
-
"numerai_ticker",
|
|
18
|
-
]
|
|
19
|
-
SIGNALS_ALLOWED_PRED_COLS = ["prediction", "signal"]
|
|
20
|
-
SIGNALS_ALLOWED_DATE_COLS = ["friday_date", "date"]
|
|
21
|
-
SIGNALS_MIN_TICKERS = 100
|
|
22
|
-
|
|
23
|
-
CRYPTO_ALLOWED_ID_COLS = ["symbol"]
|
|
24
|
-
CRYPTO_ALLOWED_PRED_COLS = ["prediction", "signal"]
|
|
25
|
-
CRYPTO_MIN_TICKERS = 100
|
|
26
|
-
|
|
27
|
-
logger = logging.getLogger(__name__)
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def _validate_headers(
|
|
31
|
-
expected_id_cols: List[str], expected_pred_cols: List[str], submission: pd.DataFrame
|
|
32
|
-
) -> Tuple[str, str]:
|
|
33
|
-
"""Validate the given submission has the right headers.
|
|
34
|
-
It is recommended to use one of the following functions instead of this one:
|
|
35
|
-
- validate_headers_numerai
|
|
36
|
-
- validate_headers_signals
|
|
37
|
-
|
|
38
|
-
Arguments:
|
|
39
|
-
submission -- pandas DataFrame of the submission
|
|
40
|
-
|
|
41
|
-
Return Tuple[str, str]:
|
|
42
|
-
- string name of the id column
|
|
43
|
-
- string name of the prediction column
|
|
44
|
-
"""
|
|
45
|
-
expected_headers = [
|
|
46
|
-
[ticker_col, signal_col]
|
|
47
|
-
for ticker_col in expected_id_cols
|
|
48
|
-
for signal_col in expected_pred_cols
|
|
49
|
-
]
|
|
50
|
-
columns = submission.columns
|
|
51
|
-
valid_headers = list(columns) in expected_headers
|
|
52
|
-
assert (
|
|
53
|
-
valid_headers
|
|
54
|
-
), f"headers must be one of {expected_id_cols} and one of {expected_pred_cols}"
|
|
55
|
-
return columns[0], columns[1]
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
def validate_headers_numerai(submission: pd.DataFrame) -> Tuple[str, str]:
|
|
59
|
-
return _validate_headers(
|
|
60
|
-
NUMERAI_ALLOWED_ID_COLS, NUMERAI_ALLOWED_PRED_COLS, submission
|
|
61
|
-
)
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
def validate_headers_signals(submission: pd.DataFrame) -> Tuple[str, str]:
|
|
65
|
-
return _validate_headers(
|
|
66
|
-
SIGNALS_ALLOWED_ID_COLS, SIGNALS_ALLOWED_PRED_COLS, submission
|
|
67
|
-
)
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
def validate_headers_crypto(submission: pd.DataFrame) -> Tuple[str, str]:
|
|
71
|
-
return _validate_headers(
|
|
72
|
-
CRYPTO_ALLOWED_ID_COLS, CRYPTO_ALLOWED_PRED_COLS, submission
|
|
73
|
-
)
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
def validate_values(submission: pd.DataFrame, prediction_col: str) -> None:
|
|
77
|
-
"""
|
|
78
|
-
Validates the given submission's values are between 0 and 1 exclusive and
|
|
79
|
-
that the submission have a non-zero standard deviation.
|
|
80
|
-
|
|
81
|
-
Arguments:
|
|
82
|
-
submission -- pandas DataFrame of the submission
|
|
83
|
-
prediction_col -- the string name of the prediction column returned by validate_headers
|
|
84
|
-
"""
|
|
85
|
-
assert (
|
|
86
|
-
submission[prediction_col].isna().sum() == 0
|
|
87
|
-
), "submission must not contain NaNs"
|
|
88
|
-
assert (
|
|
89
|
-
submission[prediction_col].between(0, 1).all()
|
|
90
|
-
), "values must be between 0 and 1 exclusive"
|
|
91
|
-
assert not np.isclose(
|
|
92
|
-
0, submission[prediction_col].std()
|
|
93
|
-
), "submission must have non-zero standard deviation"
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
def _validate_ids(
|
|
97
|
-
live_ids: pd.Series, submission: pd.DataFrame, id_col: str, min_tickers: int
|
|
98
|
-
) -> Tuple[pd.DataFrame, List[str]]:
|
|
99
|
-
"""
|
|
100
|
-
Validates the given submission has no NaNs in the given id column
|
|
101
|
-
and that the submission has a minimum number of non-duplicate ids
|
|
102
|
-
after filtering to the live_ids.
|
|
103
|
-
|
|
104
|
-
It is recommended to use one of the following functions instead of this one:
|
|
105
|
-
- validate_ids_numerai
|
|
106
|
-
- validate_ids_signals
|
|
107
|
-
|
|
108
|
-
Arguments:
|
|
109
|
-
live_ids -- pandas Series of the live ids or tickers from live universe
|
|
110
|
-
submission -- pandas DataFrame of the submission
|
|
111
|
-
id_col -- the stringn name of the column containing ids or tickers
|
|
112
|
-
|
|
113
|
-
Return Tuple[pd.DataFrame, List[str]]:
|
|
114
|
-
- submission indexed on id_col and filtered against live_ids
|
|
115
|
-
- set of invalid tickers (diff between indexed sub and live_ids-joined sub)
|
|
116
|
-
"""
|
|
117
|
-
assert (
|
|
118
|
-
not submission[id_col].isna().any()
|
|
119
|
-
), f"Submission must not contain NaNs in the {id_col} column."
|
|
120
|
-
|
|
121
|
-
index_sub = submission.copy()
|
|
122
|
-
index_sub[id_col] = index_sub[id_col].astype(str)
|
|
123
|
-
|
|
124
|
-
live_ids = live_ids.astype(str)
|
|
125
|
-
live_sub = index_sub[index_sub[id_col].isin(live_ids)].sort_values(id_col)
|
|
126
|
-
assert (
|
|
127
|
-
not live_sub[id_col].duplicated().any()
|
|
128
|
-
), f"Duplicates detected in {id_col} for live period."
|
|
129
|
-
|
|
130
|
-
# join on live_ids and ensure min tickers reached
|
|
131
|
-
assert (
|
|
132
|
-
len(live_sub) >= min_tickers
|
|
133
|
-
), "Not enough stocks submitted. Are you using the latest live ids or live universe?"
|
|
134
|
-
|
|
135
|
-
invalid_tickers = list(set(index_sub[id_col]).difference(set(live_sub[id_col])))
|
|
136
|
-
return live_sub, invalid_tickers
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
def validate_ids_numerai(
|
|
140
|
-
live_ids: pd.Series, submission: pd.DataFrame, id_col: str
|
|
141
|
-
) -> Tuple[pd.DataFrame, List[str]]:
|
|
142
|
-
return _validate_ids(live_ids, submission, id_col, len(live_ids))
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
def validate_ids_signals(
|
|
146
|
-
live_ids: pd.Series, submission: pd.DataFrame, id_col: str
|
|
147
|
-
) -> Tuple[pd.DataFrame, List[str]]:
|
|
148
|
-
return _validate_ids(live_ids, submission, id_col, SIGNALS_MIN_TICKERS)
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
def validate_ids_crypto(
|
|
152
|
-
live_ids: pd.Series, submission: pd.DataFrame, id_col: str
|
|
153
|
-
) -> Tuple[pd.DataFrame, List[str]]:
|
|
154
|
-
return _validate_ids(live_ids, submission, id_col, CRYPTO_MIN_TICKERS)
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
def remap_ids(
|
|
158
|
-
data: pd.DataFrame,
|
|
159
|
-
ticker_map: pd.Series | pd.DataFrame,
|
|
160
|
-
src_id_col: str,
|
|
161
|
-
dst_id_col: str,
|
|
162
|
-
) -> pd.DataFrame:
|
|
163
|
-
# first, index the universe and data on the source ids
|
|
164
|
-
indexed_map = ticker_map.reset_index().set_index(src_id_col)
|
|
165
|
-
indexed_data = data.set_index(src_id_col)
|
|
166
|
-
return (
|
|
167
|
-
# then, join the universe and data
|
|
168
|
-
indexed_map.join(indexed_data)
|
|
169
|
-
# get just the destination ids and prediction columns
|
|
170
|
-
.reset_index()[[dst_id_col, *indexed_data.columns]]
|
|
171
|
-
# finally, sort by the destination ticker column
|
|
172
|
-
.sort_values(dst_id_col)
|
|
173
|
-
)
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
def clean_submission(
|
|
177
|
-
live_ids: pd.Series | pd.DataFrame,
|
|
178
|
-
predictions: pd.DataFrame,
|
|
179
|
-
name: str,
|
|
180
|
-
id_col: str,
|
|
181
|
-
rank_and_fill: bool,
|
|
182
|
-
tournament: int,
|
|
183
|
-
) -> pd.Series:
|
|
184
|
-
"""Prepare predictions for submission to Numerai.
|
|
185
|
-
Filters out ids not in live data, drops duplicates, sets ids as index,
|
|
186
|
-
then optionally ranks (keeping ties) and fills NaNs with 0.5.
|
|
187
|
-
|
|
188
|
-
This function is used in Numerai to clean submissions for use in the
|
|
189
|
-
Meta Model and scoring. We only rank and fill in preparation for scoring
|
|
190
|
-
Signals and Crypto submissions.
|
|
191
|
-
|
|
192
|
-
Arguments:
|
|
193
|
-
live_ids: pd.Series - the ids in the live data
|
|
194
|
-
predictions: pd.DataFrame - the predictions to clean
|
|
195
|
-
name: str - the name of the submission (used for renaming)
|
|
196
|
-
id_col: str - the column name of the ids
|
|
197
|
-
rank_and_fill: bool - whether to rank and fill NaNs with 0.5
|
|
198
|
-
left_join_ids: bool - whether to left join the predictions onto the ids
|
|
199
|
-
"""
|
|
200
|
-
assert len(live_ids) > 0, "live_ids must not be empty"
|
|
201
|
-
if isinstance(live_ids, pd.DataFrame):
|
|
202
|
-
assert live_ids.isna().sum().sum() == 0, "live_ids must not contain NaNs"
|
|
203
|
-
else:
|
|
204
|
-
assert live_ids.isna().sum() == 0, "live_ids must not contain NaNs"
|
|
205
|
-
assert len(predictions) > 0, "predictions must not be empty"
|
|
206
|
-
|
|
207
|
-
header_fn = {
|
|
208
|
-
8: validate_headers_numerai,
|
|
209
|
-
11: validate_headers_signals,
|
|
210
|
-
12: validate_headers_crypto,
|
|
211
|
-
}
|
|
212
|
-
assert (
|
|
213
|
-
tournament in header_fn
|
|
214
|
-
), f"Unsupported tournament {tournament} for cleaning predictions"
|
|
215
|
-
ticker_col, signal_col = header_fn[tournament](predictions)
|
|
216
|
-
|
|
217
|
-
clean_preds = (
|
|
218
|
-
remap_ids(predictions, live_ids, ticker_col, id_col)
|
|
219
|
-
# drop NaNs and duplicates
|
|
220
|
-
.dropna(subset=[id_col])
|
|
221
|
-
.drop_duplicates(subset=id_col, keep="first")
|
|
222
|
-
# set ids as index and sort
|
|
223
|
-
.set_index(id_col)
|
|
224
|
-
.sort_index()
|
|
225
|
-
# rename to given name
|
|
226
|
-
.rename(columns={signal_col: name})
|
|
227
|
-
)[name]
|
|
228
|
-
# rank and fill with 0.5
|
|
229
|
-
if rank_and_fill:
|
|
230
|
-
clean_preds = tie_kept_rank(clean_preds).fillna(0.5)
|
|
231
|
-
return clean_preds
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
def clean_submission_numerai(
|
|
235
|
-
live_ids: pd.Series, submission: pd.DataFrame, user_id: str
|
|
236
|
-
) -> pd.Series:
|
|
237
|
-
return clean_submission(
|
|
238
|
-
live_ids=live_ids,
|
|
239
|
-
predictions=submission,
|
|
240
|
-
name=user_id,
|
|
241
|
-
id_col="id",
|
|
242
|
-
rank_and_fill=True,
|
|
243
|
-
tournament=8,
|
|
244
|
-
)
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
def clean_submission_signals(
|
|
248
|
-
universe: pd.DataFrame,
|
|
249
|
-
submission: pd.DataFrame,
|
|
250
|
-
submission_id: str,
|
|
251
|
-
index_col: str,
|
|
252
|
-
rank_and_fill: bool = True,
|
|
253
|
-
) -> pd.Series:
|
|
254
|
-
# drop data_type and date columns if they exist
|
|
255
|
-
if "data_type" in submission.columns:
|
|
256
|
-
logger.warning(
|
|
257
|
-
"data_type column found in Signals submission. This is deprecated and support will be removed in the future. "
|
|
258
|
-
"Please remove the data_type column from your Signals submission."
|
|
259
|
-
)
|
|
260
|
-
date_col = [
|
|
261
|
-
date_col
|
|
262
|
-
for date_col in SIGNALS_ALLOWED_DATE_COLS
|
|
263
|
-
if date_col in list(submission.columns)
|
|
264
|
-
]
|
|
265
|
-
submission = submission.drop(columns=["data_type", *date_col], errors="ignore")
|
|
266
|
-
return clean_submission(
|
|
267
|
-
live_ids=universe,
|
|
268
|
-
predictions=submission,
|
|
269
|
-
name=submission_id,
|
|
270
|
-
id_col=index_col,
|
|
271
|
-
rank_and_fill=rank_and_fill,
|
|
272
|
-
tournament=11,
|
|
273
|
-
)
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
def clean_submission_crypto(
|
|
277
|
-
universe: pd.DataFrame,
|
|
278
|
-
submission: pd.DataFrame,
|
|
279
|
-
submission_id: str,
|
|
280
|
-
index_col: str,
|
|
281
|
-
rank_and_fill: bool = True,
|
|
282
|
-
):
|
|
283
|
-
return clean_submission(
|
|
284
|
-
live_ids=universe,
|
|
285
|
-
predictions=submission,
|
|
286
|
-
name=submission_id,
|
|
287
|
-
id_col=index_col,
|
|
288
|
-
rank_and_fill=rank_and_fill,
|
|
289
|
-
tournament=12,
|
|
290
|
-
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|