numerai-tools 0.5.0.dev3__tar.gz → 0.5.0.dev4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: numerai-tools
3
- Version: 0.5.0.dev3
3
+ Version: 0.5.0.dev4
4
4
  Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
5
5
  License: MIT
6
6
  Author: Numerai Engineering
@@ -7,7 +7,7 @@ from numerai_tools.scoring import (
7
7
  generate_neutralized_weights,
8
8
  )
9
9
  from numerai_tools.submissions import (
10
- clean_submission_signals,
10
+ validate_and_clean_submission_signals,
11
11
  remap_ids,
12
12
  )
13
13
 
@@ -106,11 +106,11 @@ def calculate_max_churn_and_turnover(
106
106
  prev_week_max_turnover -- the maximum turnover from previous submissions
107
107
  """
108
108
  universe = universe.reset_index()
109
- curr_sub_vector = clean_submission_signals(
109
+ curr_sub_vector = validate_and_clean_submission_signals(
110
110
  universe=universe,
111
111
  submission=curr_sub,
112
- submission_id=curr_signal_col,
113
- index_col=curr_ticker_col,
112
+ id_col=curr_ticker_col,
113
+ rename_as=curr_signal_col,
114
114
  rank_and_fill=True,
115
115
  )
116
116
  churn_stats = []
@@ -122,11 +122,11 @@ def calculate_max_churn_and_turnover(
122
122
  prev_sub = prev_week_subs[datestamp]
123
123
  prev_neutralizer = prev_neutralizers[datestamp]
124
124
  prev_weight = prev_sample_weights[datestamp]
125
- filtered_prev_sub = clean_submission_signals(
125
+ filtered_prev_sub = validate_and_clean_submission_signals(
126
126
  universe=universe,
127
127
  submission=prev_sub,
128
- submission_id=curr_signal_col,
129
- index_col=curr_ticker_col,
128
+ id_col=curr_ticker_col,
129
+ rename_as=curr_signal_col,
130
130
  rank_and_fill=True,
131
131
  )
132
132
  prev_neutralizer = (
@@ -1,7 +1,7 @@
1
1
  from numerai_tools.scoring import tie_kept_rank
2
2
 
3
3
  import logging
4
- from typing import Tuple, List
4
+ from typing import Tuple, List, Optional
5
5
 
6
6
  import pandas as pd
7
7
  import numpy as np
@@ -49,9 +49,10 @@ def _validate_headers(
49
49
  ]
50
50
  columns = submission.columns
51
51
  valid_headers = list(columns) in expected_headers
52
- assert (
53
- valid_headers
54
- ), f"headers must be one of {expected_id_cols} and one of {expected_pred_cols}"
52
+ assert valid_headers, (
53
+ "invalid_submission_headers: headers must be one of"
54
+ f" {expected_id_cols} and one of {expected_pred_cols}"
55
+ )
55
56
  return columns[0], columns[1]
56
57
 
57
58
 
@@ -84,13 +85,13 @@ def validate_values(submission: pd.DataFrame, prediction_col: str) -> None:
84
85
  """
85
86
  assert (
86
87
  submission[prediction_col].isna().sum() == 0
87
- ), "submission must not contain NaNs"
88
+ ), "invalid_submission_values: submission must not contain NaNs"
88
89
  assert (
89
90
  submission[prediction_col].between(0, 1).all()
90
- ), "values must be between 0 and 1 exclusive"
91
+ ), "invalid_submission_values: values must be between 0 and 1 exclusive"
91
92
  assert not np.isclose(
92
93
  0, submission[prediction_col].std()
93
- ), "submission must have non-zero standard deviation"
94
+ ), "invalid_submission_values: submission must have non-zero standard deviation"
94
95
 
95
96
 
96
97
  def _validate_ids(
@@ -116,7 +117,7 @@ def _validate_ids(
116
117
  """
117
118
  assert (
118
119
  not submission[id_col].isna().any()
119
- ), f"Submission must not contain NaNs in the {id_col} column."
120
+ ), f"invalid_submission_ids: Submission must not contain NaNs in the {id_col} column."
120
121
 
121
122
  index_sub = submission.copy()
122
123
  index_sub[id_col] = index_sub[id_col].astype(str)
@@ -125,12 +126,13 @@ def _validate_ids(
125
126
  live_sub = index_sub[index_sub[id_col].isin(live_ids)].sort_values(id_col)
126
127
  assert (
127
128
  not live_sub[id_col].duplicated().any()
128
- ), f"Duplicates detected in {id_col} for live period."
129
+ ), f"invalid_submission_ids: Duplicates detected in {id_col} for live period."
129
130
 
130
131
  # join on live_ids and ensure min tickers reached
131
- assert (
132
- len(live_sub) >= min_tickers
133
- ), "Not enough stocks submitted. Are you using the latest live ids or live universe?"
132
+ assert len(live_sub) >= min_tickers, (
133
+ "invalid_submission_ids: Not enough stocks submitted."
134
+ " Are you using the latest live ids or live universe?"
135
+ )
134
136
 
135
137
  invalid_tickers = list(set(index_sub[id_col]).difference(set(live_sub[id_col])))
136
138
  return live_sub, invalid_tickers
@@ -176,10 +178,11 @@ def remap_ids(
176
178
  def clean_submission(
177
179
  live_ids: pd.Series | pd.DataFrame,
178
180
  predictions: pd.DataFrame,
179
- name: str,
181
+ ticker_col: str,
182
+ signal_col: str,
183
+ rename_as: Optional[str],
180
184
  id_col: str,
181
185
  rank_and_fill: bool,
182
- tournament: int,
183
186
  ) -> pd.Series:
184
187
  """Prepare predictions for submission to Numerai.
185
188
  Filters out ids not in live data, drops duplicates, sets ids as index,
@@ -192,10 +195,14 @@ def clean_submission(
192
195
  Arguments:
193
196
  live_ids: pd.Series - the ids in the live data
194
197
  predictions: pd.DataFrame - the predictions to clean
195
- name: str - the name of the submission (used for renaming)
198
+ ticker_col: str - the name of the ids column
199
+ signal_col: str - the name of the predictions column
200
+ rename_as: Optional[str] - the string to which the submission should be renamed
196
201
  id_col: str - the column name of the ids
197
202
  rank_and_fill: bool - whether to rank and fill NaNs with 0.5
198
- left_join_ids: bool - whether to left join the predictions onto the ids
203
+
204
+ Returns:
205
+ pd.Series - the cleaned prediction series with ids as index
199
206
  """
200
207
  assert len(live_ids) > 0, "live_ids must not be empty"
201
208
  if isinstance(live_ids, pd.DataFrame):
@@ -204,16 +211,6 @@ def clean_submission(
204
211
  assert live_ids.isna().sum() == 0, "live_ids must not contain NaNs"
205
212
  assert len(predictions) > 0, "predictions must not be empty"
206
213
 
207
- header_fn = {
208
- 8: validate_headers_numerai,
209
- 11: validate_headers_signals,
210
- 12: validate_headers_crypto,
211
- }
212
- assert (
213
- tournament in header_fn
214
- ), f"Unsupported tournament {tournament} for cleaning predictions"
215
- ticker_col, signal_col = header_fn[tournament](predictions)
216
-
217
214
  clean_preds = (
218
215
  remap_ids(predictions, live_ids, ticker_col, id_col)
219
216
  # drop NaNs and duplicates
@@ -223,32 +220,42 @@ def clean_submission(
223
220
  .set_index(id_col)
224
221
  .sort_index()
225
222
  # rename to given name
226
- .rename(columns={signal_col: name})
227
- )[name]
223
+ .rename(columns={signal_col: rename_as})
224
+ )[rename_as]
228
225
  # rank and fill with 0.5
229
226
  if rank_and_fill:
230
227
  clean_preds = tie_kept_rank(clean_preds).fillna(0.5)
231
228
  return clean_preds
232
229
 
233
230
 
234
- def clean_submission_numerai(
235
- live_ids: pd.Series, submission: pd.DataFrame, user_id: str
231
+ def validate_and_clean_submission_numerai(
232
+ universe: pd.Series,
233
+ submission: pd.DataFrame,
234
+ id_col: str = "id",
235
+ rename_as: Optional[str] = None,
236
+ rank_and_fill: bool = False,
236
237
  ) -> pd.Series:
238
+ ticker_col, signal_col = validate_headers_numerai(submission)
239
+ filtered_sub, invalid_tickers = validate_ids_numerai(
240
+ universe, submission, ticker_col
241
+ )
242
+ validate_values(filtered_sub, signal_col)
237
243
  return clean_submission(
238
- live_ids=live_ids,
239
- predictions=submission,
240
- name=user_id,
241
- id_col="id",
242
- rank_and_fill=True,
243
- tournament=8,
244
+ live_ids=universe,
245
+ predictions=filtered_sub,
246
+ ticker_col=ticker_col,
247
+ signal_col=signal_col,
248
+ rename_as=rename_as,
249
+ id_col=id_col,
250
+ rank_and_fill=rank_and_fill,
244
251
  )
245
252
 
246
253
 
247
- def clean_submission_signals(
254
+ def validate_and_clean_submission_signals(
248
255
  universe: pd.DataFrame,
249
256
  submission: pd.DataFrame,
250
- submission_id: str,
251
- index_col: str,
257
+ id_col: str,
258
+ rename_as: Optional[str] = None,
252
259
  rank_and_fill: bool = True,
253
260
  ) -> pd.Series:
254
261
  # drop data_type and date columns if they exist
@@ -263,28 +270,40 @@ def clean_submission_signals(
263
270
  if date_col in list(submission.columns)
264
271
  ]
265
272
  submission = submission.drop(columns=["data_type", *date_col], errors="ignore")
273
+ ticker_col, signal_col = validate_headers_signals(submission)
274
+ filtered_sub, invalid_tickers = validate_ids_signals(
275
+ universe[ticker_col], submission, ticker_col
276
+ )
277
+ validate_values(filtered_sub, signal_col)
266
278
  return clean_submission(
267
279
  live_ids=universe,
268
- predictions=submission,
269
- name=submission_id,
270
- id_col=index_col,
280
+ predictions=filtered_sub,
281
+ ticker_col=ticker_col,
282
+ signal_col=signal_col,
283
+ rename_as=rename_as,
284
+ id_col=id_col,
271
285
  rank_and_fill=rank_and_fill,
272
- tournament=11,
273
286
  )
274
287
 
275
288
 
276
- def clean_submission_crypto(
289
+ def validate_and_clean_submission_crypto(
277
290
  universe: pd.DataFrame,
278
291
  submission: pd.DataFrame,
279
- submission_id: str,
280
- index_col: str,
292
+ id_col: str = "symbol",
293
+ rename_as: Optional[str] = None,
281
294
  rank_and_fill: bool = True,
282
295
  ):
296
+ ticker_col, signal_col = validate_headers_crypto(submission)
297
+ filtered_sub, invalid_tickers = validate_ids_crypto(
298
+ universe[ticker_col], submission, ticker_col
299
+ )
300
+ validate_values(filtered_sub, signal_col)
283
301
  return clean_submission(
284
302
  live_ids=universe,
285
- predictions=submission,
286
- name=submission_id,
287
- id_col=index_col,
303
+ predictions=filtered_sub,
304
+ ticker_col=ticker_col,
305
+ signal_col=signal_col,
306
+ rename_as=rename_as,
307
+ id_col=id_col,
288
308
  rank_and_fill=rank_and_fill,
289
- tournament=12,
290
309
  )
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "numerai-tools"
3
- version = "0.5.0.dev3"
3
+ version = "0.5.0.dev4"
4
4
  description = "A collection of open-source tools to help interact with Numerai, model data, and automate submissions."
5
5
  authors = [
6
6
  {name = "Numerai Engineering",email = "engineering@numer.ai"}