numerai-tools 0.5.0.dev3__tar.gz → 0.5.0.dev4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {numerai_tools-0.5.0.dev3 → numerai_tools-0.5.0.dev4}/PKG-INFO +1 -1
- {numerai_tools-0.5.0.dev3 → numerai_tools-0.5.0.dev4}/numerai_tools/signals.py +7 -7
- {numerai_tools-0.5.0.dev3 → numerai_tools-0.5.0.dev4}/numerai_tools/submissions.py +69 -50
- {numerai_tools-0.5.0.dev3 → numerai_tools-0.5.0.dev4}/pyproject.toml +1 -1
- {numerai_tools-0.5.0.dev3 → numerai_tools-0.5.0.dev4}/LICENSE +0 -0
- {numerai_tools-0.5.0.dev3 → numerai_tools-0.5.0.dev4}/README.md +0 -0
- {numerai_tools-0.5.0.dev3 → numerai_tools-0.5.0.dev4}/numerai_tools/__init__.py +0 -0
- {numerai_tools-0.5.0.dev3 → numerai_tools-0.5.0.dev4}/numerai_tools/py.typed +0 -0
- {numerai_tools-0.5.0.dev3 → numerai_tools-0.5.0.dev4}/numerai_tools/scoring.py +0 -0
|
@@ -7,7 +7,7 @@ from numerai_tools.scoring import (
|
|
|
7
7
|
generate_neutralized_weights,
|
|
8
8
|
)
|
|
9
9
|
from numerai_tools.submissions import (
|
|
10
|
-
|
|
10
|
+
validate_and_clean_submission_signals,
|
|
11
11
|
remap_ids,
|
|
12
12
|
)
|
|
13
13
|
|
|
@@ -106,11 +106,11 @@ def calculate_max_churn_and_turnover(
|
|
|
106
106
|
prev_week_max_turnover -- the maximum turnover from previous submissions
|
|
107
107
|
"""
|
|
108
108
|
universe = universe.reset_index()
|
|
109
|
-
curr_sub_vector = clean_submission_signals(
|
|
109
|
+
curr_sub_vector = validate_and_clean_submission_signals(
|
|
110
110
|
universe=universe,
|
|
111
111
|
submission=curr_sub,
|
|
112
|
-
|
|
113
|
-
|
|
112
|
+
id_col=curr_ticker_col,
|
|
113
|
+
rename_as=curr_signal_col,
|
|
114
114
|
rank_and_fill=True,
|
|
115
115
|
)
|
|
116
116
|
churn_stats = []
|
|
@@ -122,11 +122,11 @@ def calculate_max_churn_and_turnover(
|
|
|
122
122
|
prev_sub = prev_week_subs[datestamp]
|
|
123
123
|
prev_neutralizer = prev_neutralizers[datestamp]
|
|
124
124
|
prev_weight = prev_sample_weights[datestamp]
|
|
125
|
-
filtered_prev_sub = clean_submission_signals(
|
|
125
|
+
filtered_prev_sub = validate_and_clean_submission_signals(
|
|
126
126
|
universe=universe,
|
|
127
127
|
submission=prev_sub,
|
|
128
|
-
|
|
129
|
-
|
|
128
|
+
id_col=curr_ticker_col,
|
|
129
|
+
rename_as=curr_signal_col,
|
|
130
130
|
rank_and_fill=True,
|
|
131
131
|
)
|
|
132
132
|
prev_neutralizer = (
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from numerai_tools.scoring import tie_kept_rank
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
-
from typing import Tuple, List
|
|
4
|
+
from typing import Tuple, List, Optional
|
|
5
5
|
|
|
6
6
|
import pandas as pd
|
|
7
7
|
import numpy as np
|
|
@@ -49,9 +49,10 @@ def _validate_headers(
|
|
|
49
49
|
]
|
|
50
50
|
columns = submission.columns
|
|
51
51
|
valid_headers = list(columns) in expected_headers
|
|
52
|
-
assert (
|
|
53
|
-
|
|
54
|
-
|
|
52
|
+
assert valid_headers, (
|
|
53
|
+
"invalid_submission_headers: headers must be one of"
|
|
54
|
+
f" {expected_id_cols} and one of {expected_pred_cols}"
|
|
55
|
+
)
|
|
55
56
|
return columns[0], columns[1]
|
|
56
57
|
|
|
57
58
|
|
|
@@ -84,13 +85,13 @@ def validate_values(submission: pd.DataFrame, prediction_col: str) -> None:
|
|
|
84
85
|
"""
|
|
85
86
|
assert (
|
|
86
87
|
submission[prediction_col].isna().sum() == 0
|
|
87
|
-
), "submission must not contain NaNs"
|
|
88
|
+
), "invalid_submission_values: submission must not contain NaNs"
|
|
88
89
|
assert (
|
|
89
90
|
submission[prediction_col].between(0, 1).all()
|
|
90
|
-
), "values must be between 0 and 1 exclusive"
|
|
91
|
+
), "invalid_submission_values: values must be between 0 and 1 exclusive"
|
|
91
92
|
assert not np.isclose(
|
|
92
93
|
0, submission[prediction_col].std()
|
|
93
|
-
), "submission must have non-zero standard deviation"
|
|
94
|
+
), "invalid_submission_values: submission must have non-zero standard deviation"
|
|
94
95
|
|
|
95
96
|
|
|
96
97
|
def _validate_ids(
|
|
@@ -116,7 +117,7 @@ def _validate_ids(
|
|
|
116
117
|
"""
|
|
117
118
|
assert (
|
|
118
119
|
not submission[id_col].isna().any()
|
|
119
|
-
), f"Submission must not contain NaNs in the {id_col} column."
|
|
120
|
+
), f"invalid_submission_ids: Submission must not contain NaNs in the {id_col} column."
|
|
120
121
|
|
|
121
122
|
index_sub = submission.copy()
|
|
122
123
|
index_sub[id_col] = index_sub[id_col].astype(str)
|
|
@@ -125,12 +126,13 @@ def _validate_ids(
|
|
|
125
126
|
live_sub = index_sub[index_sub[id_col].isin(live_ids)].sort_values(id_col)
|
|
126
127
|
assert (
|
|
127
128
|
not live_sub[id_col].duplicated().any()
|
|
128
|
-
), f"Duplicates detected in {id_col} for live period."
|
|
129
|
+
), f"invalid_submission_ids: Duplicates detected in {id_col} for live period."
|
|
129
130
|
|
|
130
131
|
# join on live_ids and ensure min tickers reached
|
|
131
|
-
assert (
|
|
132
|
-
|
|
133
|
-
|
|
132
|
+
assert len(live_sub) >= min_tickers, (
|
|
133
|
+
"invalid_submission_ids: Not enough stocks submitted."
|
|
134
|
+
" Are you using the latest live ids or live universe?"
|
|
135
|
+
)
|
|
134
136
|
|
|
135
137
|
invalid_tickers = list(set(index_sub[id_col]).difference(set(live_sub[id_col])))
|
|
136
138
|
return live_sub, invalid_tickers
|
|
@@ -176,10 +178,11 @@ def remap_ids(
|
|
|
176
178
|
def clean_submission(
|
|
177
179
|
live_ids: pd.Series | pd.DataFrame,
|
|
178
180
|
predictions: pd.DataFrame,
|
|
179
|
-
|
|
181
|
+
ticker_col: str,
|
|
182
|
+
signal_col: str,
|
|
183
|
+
rename_as: Optional[str],
|
|
180
184
|
id_col: str,
|
|
181
185
|
rank_and_fill: bool,
|
|
182
|
-
tournament: int,
|
|
183
186
|
) -> pd.Series:
|
|
184
187
|
"""Prepare predictions for submission to Numerai.
|
|
185
188
|
Filters out ids not in live data, drops duplicates, sets ids as index,
|
|
@@ -192,10 +195,14 @@ def clean_submission(
|
|
|
192
195
|
Arguments:
|
|
193
196
|
live_ids: pd.Series - the ids in the live data
|
|
194
197
|
predictions: pd.DataFrame - the predictions to clean
|
|
195
|
-
|
|
198
|
+
ticker_col: str - the name of the ids column
|
|
199
|
+
signal_col: str - the name of the predictions column
|
|
200
|
+
rename_as: Optional[str] - the string to which the submission should be renamed
|
|
196
201
|
id_col: str - the column name of the ids
|
|
197
202
|
rank_and_fill: bool - whether to rank and fill NaNs with 0.5
|
|
198
|
-
|
|
203
|
+
|
|
204
|
+
Returns:
|
|
205
|
+
pd.Series - the cleaned prediction series with ids as index
|
|
199
206
|
"""
|
|
200
207
|
assert len(live_ids) > 0, "live_ids must not be empty"
|
|
201
208
|
if isinstance(live_ids, pd.DataFrame):
|
|
@@ -204,16 +211,6 @@ def clean_submission(
|
|
|
204
211
|
assert live_ids.isna().sum() == 0, "live_ids must not contain NaNs"
|
|
205
212
|
assert len(predictions) > 0, "predictions must not be empty"
|
|
206
213
|
|
|
207
|
-
header_fn = {
|
|
208
|
-
8: validate_headers_numerai,
|
|
209
|
-
11: validate_headers_signals,
|
|
210
|
-
12: validate_headers_crypto,
|
|
211
|
-
}
|
|
212
|
-
assert (
|
|
213
|
-
tournament in header_fn
|
|
214
|
-
), f"Unsupported tournament {tournament} for cleaning predictions"
|
|
215
|
-
ticker_col, signal_col = header_fn[tournament](predictions)
|
|
216
|
-
|
|
217
214
|
clean_preds = (
|
|
218
215
|
remap_ids(predictions, live_ids, ticker_col, id_col)
|
|
219
216
|
# drop NaNs and duplicates
|
|
@@ -223,32 +220,42 @@ def clean_submission(
|
|
|
223
220
|
.set_index(id_col)
|
|
224
221
|
.sort_index()
|
|
225
222
|
# rename to given name
|
|
226
|
-
.rename(columns={signal_col:
|
|
227
|
-
)[
|
|
223
|
+
.rename(columns={signal_col: rename_as})
|
|
224
|
+
)[rename_as]
|
|
228
225
|
# rank and fill with 0.5
|
|
229
226
|
if rank_and_fill:
|
|
230
227
|
clean_preds = tie_kept_rank(clean_preds).fillna(0.5)
|
|
231
228
|
return clean_preds
|
|
232
229
|
|
|
233
230
|
|
|
234
|
-
def
|
|
235
|
-
|
|
231
|
+
def validate_and_clean_submission_numerai(
|
|
232
|
+
universe: pd.Series,
|
|
233
|
+
submission: pd.DataFrame,
|
|
234
|
+
id_col: str = "id",
|
|
235
|
+
rename_as: Optional[str] = None,
|
|
236
|
+
rank_and_fill: bool = False,
|
|
236
237
|
) -> pd.Series:
|
|
238
|
+
ticker_col, signal_col = validate_headers_numerai(submission)
|
|
239
|
+
filtered_sub, invalid_tickers = validate_ids_numerai(
|
|
240
|
+
universe, submission, ticker_col
|
|
241
|
+
)
|
|
242
|
+
validate_values(filtered_sub, signal_col)
|
|
237
243
|
return clean_submission(
|
|
238
|
-
live_ids=
|
|
239
|
-
predictions=
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
+
live_ids=universe,
|
|
245
|
+
predictions=filtered_sub,
|
|
246
|
+
ticker_col=ticker_col,
|
|
247
|
+
signal_col=signal_col,
|
|
248
|
+
rename_as=rename_as,
|
|
249
|
+
id_col=id_col,
|
|
250
|
+
rank_and_fill=rank_and_fill,
|
|
244
251
|
)
|
|
245
252
|
|
|
246
253
|
|
|
247
|
-
def clean_submission_signals(
|
|
254
|
+
def validate_and_clean_submission_signals(
|
|
248
255
|
universe: pd.DataFrame,
|
|
249
256
|
submission: pd.DataFrame,
|
|
250
|
-
|
|
251
|
-
|
|
257
|
+
id_col: str,
|
|
258
|
+
rename_as: Optional[str] = None,
|
|
252
259
|
rank_and_fill: bool = True,
|
|
253
260
|
) -> pd.Series:
|
|
254
261
|
# drop data_type and date columns if they exist
|
|
@@ -263,28 +270,40 @@ def clean_submission_signals(
|
|
|
263
270
|
if date_col in list(submission.columns)
|
|
264
271
|
]
|
|
265
272
|
submission = submission.drop(columns=["data_type", *date_col], errors="ignore")
|
|
273
|
+
ticker_col, signal_col = validate_headers_signals(submission)
|
|
274
|
+
filtered_sub, invalid_tickers = validate_ids_signals(
|
|
275
|
+
universe[ticker_col], submission, ticker_col
|
|
276
|
+
)
|
|
277
|
+
validate_values(filtered_sub, signal_col)
|
|
266
278
|
return clean_submission(
|
|
267
279
|
live_ids=universe,
|
|
268
|
-
predictions=
|
|
269
|
-
|
|
270
|
-
|
|
280
|
+
predictions=filtered_sub,
|
|
281
|
+
ticker_col=ticker_col,
|
|
282
|
+
signal_col=signal_col,
|
|
283
|
+
rename_as=rename_as,
|
|
284
|
+
id_col=id_col,
|
|
271
285
|
rank_and_fill=rank_and_fill,
|
|
272
|
-
tournament=11,
|
|
273
286
|
)
|
|
274
287
|
|
|
275
288
|
|
|
276
|
-
def
|
|
289
|
+
def validate_and_clean_submission_crypto(
|
|
277
290
|
universe: pd.DataFrame,
|
|
278
291
|
submission: pd.DataFrame,
|
|
279
|
-
|
|
280
|
-
|
|
292
|
+
id_col: str = "symbol",
|
|
293
|
+
rename_as: Optional[str] = None,
|
|
281
294
|
rank_and_fill: bool = True,
|
|
282
295
|
):
|
|
296
|
+
ticker_col, signal_col = validate_headers_crypto(submission)
|
|
297
|
+
filtered_sub, invalid_tickers = validate_ids_crypto(
|
|
298
|
+
universe[ticker_col], submission, ticker_col
|
|
299
|
+
)
|
|
300
|
+
validate_values(filtered_sub, signal_col)
|
|
283
301
|
return clean_submission(
|
|
284
302
|
live_ids=universe,
|
|
285
|
-
predictions=
|
|
286
|
-
|
|
287
|
-
|
|
303
|
+
predictions=filtered_sub,
|
|
304
|
+
ticker_col=ticker_col,
|
|
305
|
+
signal_col=signal_col,
|
|
306
|
+
rename_as=rename_as,
|
|
307
|
+
id_col=id_col,
|
|
288
308
|
rank_and_fill=rank_and_fill,
|
|
289
|
-
tournament=12,
|
|
290
309
|
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "numerai-tools"
|
|
3
|
-
version = "0.5.0.dev3"
|
|
3
|
+
version = "0.5.0.dev4"
|
|
4
4
|
description = "A collection of open-source tools to help interact with Numerai, model data, and automate submissions."
|
|
5
5
|
authors = [
|
|
6
6
|
{name = "Numerai Engineering",email = "engineering@numer.ai"}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|