numerai-tools 0.5.0.dev4__tar.gz → 0.5.0.dev5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: numerai-tools
3
- Version: 0.5.0.dev4
3
+ Version: 0.5.0.dev5
4
4
  Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
5
5
  License: MIT
6
6
  Author: Numerai Engineering
@@ -7,7 +7,8 @@ from numerai_tools.scoring import (
7
7
  generate_neutralized_weights,
8
8
  )
9
9
  from numerai_tools.submissions import (
10
- validate_and_clean_submission_signals,
10
+ validate_submission_signals,
11
+ clean_submission,
11
12
  remap_ids,
12
13
  )
13
14
 
@@ -106,11 +107,20 @@ def calculate_max_churn_and_turnover(
106
107
  prev_week_max_turnover -- the maximum turnover from previous submissions
107
108
  """
108
109
  universe = universe.reset_index()
109
- curr_sub_vector = validate_and_clean_submission_signals(
110
+ (
111
+ curr_ticker_col,
112
+ curr_signal_col,
113
+ curr_sub,
114
+ _,
115
+ ) = validate_submission_signals(
110
116
  universe=universe,
111
117
  submission=curr_sub,
112
- id_col=curr_ticker_col,
113
- rename_as=curr_signal_col,
118
+ )
119
+ curr_sub_vector = clean_submission(
120
+ universe=universe,
121
+ submission=curr_sub,
122
+ src_id_col=curr_ticker_col,
123
+ src_signal_col=curr_signal_col,
114
124
  rank_and_fill=True,
115
125
  )
116
126
  churn_stats = []
@@ -122,11 +132,22 @@ def calculate_max_churn_and_turnover(
122
132
  prev_sub = prev_week_subs[datestamp]
123
133
  prev_neutralizer = prev_neutralizers[datestamp]
124
134
  prev_weight = prev_sample_weights[datestamp]
125
- filtered_prev_sub = validate_and_clean_submission_signals(
135
+ (
136
+ prev_ticker_col,
137
+ prev_signal_col,
138
+ prev_sub,
139
+ _,
140
+ ) = validate_submission_signals(
141
+ universe=universe,
142
+ submission=prev_sub,
143
+ )
144
+ filtered_prev_sub = clean_submission(
126
145
  universe=universe,
127
146
  submission=prev_sub,
128
- id_col=curr_ticker_col,
129
- rename_as=curr_signal_col,
147
+ src_id_col=prev_ticker_col,
148
+ src_signal_col=prev_signal_col,
149
+ dst_id_col=curr_ticker_col,
150
+ dst_signal_col=curr_signal_col,
130
151
  rank_and_fill=True,
131
152
  )
132
153
  prev_neutralizer = (
@@ -28,7 +28,10 @@ logger = logging.getLogger(__name__)
28
28
 
29
29
 
30
30
  def _validate_headers(
31
- expected_id_cols: List[str], expected_pred_cols: List[str], submission: pd.DataFrame
31
+ submission: pd.DataFrame,
32
+ expected_id_cols: List[str],
33
+ expected_pred_cols: List[str],
34
+ other_cols: Optional[List[str]] = None,
32
35
  ) -> Tuple[str, str]:
33
36
  """Validate the given submission has the right headers.
34
37
  It is recommended to use one of the following functions instead of this one:
@@ -37,6 +40,9 @@ def _validate_headers(
37
40
 
38
41
  Arguments:
39
42
  submission -- pandas DataFrame of the submission
43
+ expected_id_cols -- list of expected id columns
44
+ expected_pred_cols -- list of expected prediction columns
45
+ other_cols -- optional list of other columns that can be present in the submission
40
46
 
41
47
  Return Tuple[str, str]:
42
48
  - string name of the id column
@@ -47,6 +53,13 @@ def _validate_headers(
47
53
  for ticker_col in expected_id_cols
48
54
  for signal_col in expected_pred_cols
49
55
  ]
56
+ if other_cols is not None:
57
+ expected_headers += [
58
+ [ticker_col, signal_col, other_col]
59
+ for ticker_col in expected_id_cols
60
+ for signal_col in expected_pred_cols
61
+ for other_col in other_cols
62
+ ]
50
63
  columns = submission.columns
51
64
  valid_headers = list(columns) in expected_headers
52
65
  assert valid_headers, (
@@ -58,19 +71,26 @@ def _validate_headers(
58
71
 
59
72
  def validate_headers_numerai(submission: pd.DataFrame) -> Tuple[str, str]:
60
73
  return _validate_headers(
61
- NUMERAI_ALLOWED_ID_COLS, NUMERAI_ALLOWED_PRED_COLS, submission
74
+ submission,
75
+ NUMERAI_ALLOWED_ID_COLS,
76
+ NUMERAI_ALLOWED_PRED_COLS,
62
77
  )
63
78
 
64
79
 
65
80
  def validate_headers_signals(submission: pd.DataFrame) -> Tuple[str, str]:
66
81
  return _validate_headers(
67
- SIGNALS_ALLOWED_ID_COLS, SIGNALS_ALLOWED_PRED_COLS, submission
82
+ submission,
83
+ SIGNALS_ALLOWED_ID_COLS,
84
+ SIGNALS_ALLOWED_PRED_COLS,
85
+ SIGNALS_ALLOWED_DATE_COLS,
68
86
  )
69
87
 
70
88
 
71
89
  def validate_headers_crypto(submission: pd.DataFrame) -> Tuple[str, str]:
72
90
  return _validate_headers(
73
- CRYPTO_ALLOWED_ID_COLS, CRYPTO_ALLOWED_PRED_COLS, submission
91
+ submission,
92
+ CRYPTO_ALLOWED_ID_COLS,
93
+ CRYPTO_ALLOWED_PRED_COLS,
74
94
  )
75
95
 
76
96
 
@@ -156,12 +176,94 @@ def validate_ids_crypto(
156
176
  return _validate_ids(live_ids, submission, id_col, CRYPTO_MIN_TICKERS)
157
177
 
158
178
 
179
+ def validate_submission_numerai(
180
+ universe: pd.Series, submission: pd.DataFrame
181
+ ) -> Tuple[str, str, pd.DataFrame, List[str]]:
182
+ """Validate the headers, ids, and values for a submission.
183
+
184
+ Arguments:
185
+ universe: pd.Series - the live universe of ids on which the predictions are based
186
+ submission: pd.DataFrame - the predictions to validate
187
+
188
+ Returns:
189
+ Tuple[str, str, pd.DataFrame, List[str]] - the validated ticker column, signal column,
190
+ filtered submission, and list of invalid tickers
191
+ """
192
+ ticker_col, signal_col = validate_headers_numerai(submission)
193
+ filtered_sub, invalid_tickers = validate_ids_numerai(
194
+ universe, submission, ticker_col
195
+ )
196
+ validate_values(filtered_sub, signal_col)
197
+ return ticker_col, signal_col, filtered_sub, invalid_tickers
198
+
199
+
200
+ def validate_submission_signals(
201
+ universe: pd.DataFrame, submission: pd.DataFrame
202
+ ) -> Tuple[str, str, pd.DataFrame, List[str]]:
203
+ """Validate the headers, ids, and values for a submission.
204
+
205
+ Arguments:
206
+ universe: pd.DataFrame - the live universe of ids on which the predictions are based
207
+ submission: pd.DataFrame - the predictions to validate
208
+
209
+ Returns:
210
+ Tuple[str, str, pd.DataFrame, List[str]] - the validated ticker column, signal column,
211
+ filtered submission, and list of invalid tickers
212
+ """
213
+ # drop the deprecated data_type column if it exists; date columns are kept and checked by validate_headers_signals
214
+ if "data_type" in submission.columns:
215
+ logger.warning(
216
+ "data_type column found in Signals submission. This is deprecated and support will be removed in the future. "
217
+ "Please remove the data_type column from your Signals submission."
218
+ )
219
+ submission = submission.drop(columns=["data_type"], errors="ignore")
220
+ ticker_col, signal_col = validate_headers_signals(submission)
221
+ filtered_sub, invalid_tickers = validate_ids_signals(
222
+ universe[ticker_col], submission, ticker_col
223
+ )
224
+ validate_values(filtered_sub, signal_col)
225
+ return ticker_col, signal_col, filtered_sub, invalid_tickers
226
+
227
+
228
+ def validate_submission_crypto(
229
+ universe: pd.DataFrame, submission: pd.DataFrame
230
+ ) -> Tuple[str, str, pd.DataFrame, List[str]]:
231
+ """Validate the headers, ids, and values for a submission.
232
+
233
+ Arguments:
234
+ universe: pd.DataFrame - the live universe of ids on which the predictions are based
235
+ submission: pd.DataFrame - the predictions to validate
236
+
237
+ Returns:
238
+ Tuple[str, str, pd.DataFrame, List[str]] - the validated ticker column, signal column,
239
+ filtered submission, and list of invalid tickers
240
+ """
241
+ print(universe)
242
+ ticker_col, signal_col = validate_headers_crypto(submission)
243
+ filtered_sub, invalid_tickers = validate_ids_crypto(
244
+ universe[ticker_col], submission, ticker_col
245
+ )
246
+ validate_values(filtered_sub, signal_col)
247
+ return ticker_col, signal_col, filtered_sub, invalid_tickers
248
+
249
+
159
250
  def remap_ids(
160
251
  data: pd.DataFrame,
161
252
  ticker_map: pd.Series | pd.DataFrame,
162
253
  src_id_col: str,
163
254
  dst_id_col: str,
164
255
  ) -> pd.DataFrame:
256
+ """Join the data to the ticker map based on source ids
257
+ and remap to the destination ids. If the ticker map is a Series, it is assumed that
258
+ src_id_col and dst_id_col are the same, and the ticker map is simply used to
259
+ ensure the data has all ids in the ticker map.
260
+
261
+ Arguments:
262
+ data: pd.DataFrame - the data to remap
263
+ ticker_map: pd.Series | pd.DataFrame - the mapping of source ids to destination ids
264
+ src_id_col: str - the name of the source ids column in the data
265
+ dst_id_col: str - the name of the destination ids column in the ticker map
266
+ """
165
267
  # first, index the universe and data on the source ids
166
268
  indexed_map = ticker_map.reset_index().set_index(src_id_col)
167
269
  indexed_data = data.set_index(src_id_col)
@@ -176,134 +278,58 @@ def remap_ids(
176
278
 
177
279
 
178
280
  def clean_submission(
179
- live_ids: pd.Series | pd.DataFrame,
180
- predictions: pd.DataFrame,
181
- ticker_col: str,
182
- signal_col: str,
183
- rename_as: Optional[str],
184
- id_col: str,
185
- rank_and_fill: bool,
281
+ universe: pd.Series | pd.DataFrame,
282
+ submission: pd.DataFrame,
283
+ src_id_col: str,
284
+ src_signal_col: str,
285
+ dst_id_col: Optional[str] = None,
286
+ dst_signal_col: Optional[str] = None,
287
+ rank_and_fill: bool = False,
186
288
  ) -> pd.Series:
187
- """Prepare predictions for submission to Numerai.
188
- Filters out ids not in live data, drops duplicates, sets ids as index,
189
- then optionally ranks (keeping ties) and fills NaNs with 0.5.
289
+ """Prepares your submission for uploading to a Numerai tournament.
290
+ Joins your submission to the universe, remaps ids as needed, drops
291
+ duplicates, sets ids as index, renames the series, then optionally
292
+ tie-kept ranks and fills NaNs with 0.5.
190
293
 
191
294
  This function is used in Numerai to clean submissions for use in the
192
- Meta Model and scoring. We only rank and fill in preparation for scoring
193
- Signals and Crypto submissions.
295
+ Meta Model and scoring. We rank and fill submissions before scoring.
194
296
 
195
297
  Arguments:
196
- live_ids: pd.Series - the ids in the live data
197
- predictions: pd.DataFrame - the predictions to clean
198
- ticker_col: str - the name of the ids column
199
- signal_col: str - the name of the predictions column
200
- rename_as: Optional[str] - the string to which the submission should be renamed
201
- id_col: str - the column name of the ids
202
- rank_and_fill: bool - whether to rank and fill NaNs with 0.5
298
+ universe: pd.Series | pd.DataFrame - the live universe of ids on which the predictions are based
299
+ submission: pd.DataFrame - the submission to clean
300
+ src_id_col: str - the name of the ids column
301
+ src_signal_col: str - the name of the predictions column
302
+ dst_id_col: Optional[str] - optional name of the id column to map the ids to
303
+ dst_signal_col: Optional[str] - optional name of the signal column to rename the submission to
304
+ rank_and_fill: bool - whether to call tie_kept_rank and then fill NaNs with 0.5
203
305
 
204
306
  Returns:
205
- pd.Series - the cleaned prediction series with ids as index
307
+ pd.Series - the cleaned, properly indexed submission
206
308
  """
207
- assert len(live_ids) > 0, "live_ids must not be empty"
208
- if isinstance(live_ids, pd.DataFrame):
209
- assert live_ids.isna().sum().sum() == 0, "live_ids must not contain NaNs"
309
+ assert len(universe) > 0, "universe must not be empty"
310
+ if isinstance(universe, pd.DataFrame):
311
+ assert universe.isna().sum().sum() == 0, "universe must not contain NaNs"
210
312
  else:
211
- assert live_ids.isna().sum() == 0, "live_ids must not contain NaNs"
212
- assert len(predictions) > 0, "predictions must not be empty"
313
+ assert universe.isna().sum() == 0, "universe must not contain NaNs"
314
+ assert len(submission) > 0, "predictions must not be empty"
315
+
316
+ if dst_id_col is None:
317
+ dst_id_col = src_id_col
318
+ if dst_signal_col is None:
319
+ dst_signal_col = src_signal_col
213
320
 
214
321
  clean_preds = (
215
- remap_ids(predictions, live_ids, ticker_col, id_col)
322
+ remap_ids(submission, universe, src_id_col, dst_id_col)
216
323
  # drop NaNs and duplicates
217
- .dropna(subset=[id_col])
218
- .drop_duplicates(subset=id_col, keep="first")
324
+ .dropna(subset=[dst_id_col])
325
+ .drop_duplicates(subset=dst_id_col, keep="first")
219
326
  # set ids as index and sort
220
- .set_index(id_col)
327
+ .set_index(dst_id_col)
221
328
  .sort_index()
222
329
  # rename to given name
223
- .rename(columns={signal_col: rename_as})
224
- )[rename_as]
330
+ .rename(columns={src_signal_col: dst_signal_col})
331
+ )[dst_signal_col]
225
332
  # rank and fill with 0.5
226
333
  if rank_and_fill:
227
334
  clean_preds = tie_kept_rank(clean_preds).fillna(0.5)
228
335
  return clean_preds
229
-
230
-
231
- def validate_and_clean_submission_numerai(
232
- universe: pd.Series,
233
- submission: pd.DataFrame,
234
- id_col: str = "id",
235
- rename_as: Optional[str] = None,
236
- rank_and_fill: bool = False,
237
- ) -> pd.Series:
238
- ticker_col, signal_col = validate_headers_numerai(submission)
239
- filtered_sub, invalid_tickers = validate_ids_numerai(
240
- universe, submission, ticker_col
241
- )
242
- validate_values(filtered_sub, signal_col)
243
- return clean_submission(
244
- live_ids=universe,
245
- predictions=filtered_sub,
246
- ticker_col=ticker_col,
247
- signal_col=signal_col,
248
- rename_as=rename_as,
249
- id_col=id_col,
250
- rank_and_fill=rank_and_fill,
251
- )
252
-
253
-
254
- def validate_and_clean_submission_signals(
255
- universe: pd.DataFrame,
256
- submission: pd.DataFrame,
257
- id_col: str,
258
- rename_as: Optional[str] = None,
259
- rank_and_fill: bool = True,
260
- ) -> pd.Series:
261
- # drop data_type and date columns if they exist
262
- if "data_type" in submission.columns:
263
- logger.warning(
264
- "data_type column found in Signals submission. This is deprecated and support will be removed in the future. "
265
- "Please remove the data_type column from your Signals submission."
266
- )
267
- date_col = [
268
- date_col
269
- for date_col in SIGNALS_ALLOWED_DATE_COLS
270
- if date_col in list(submission.columns)
271
- ]
272
- submission = submission.drop(columns=["data_type", *date_col], errors="ignore")
273
- ticker_col, signal_col = validate_headers_signals(submission)
274
- filtered_sub, invalid_tickers = validate_ids_signals(
275
- universe[ticker_col], submission, ticker_col
276
- )
277
- validate_values(filtered_sub, signal_col)
278
- return clean_submission(
279
- live_ids=universe,
280
- predictions=filtered_sub,
281
- ticker_col=ticker_col,
282
- signal_col=signal_col,
283
- rename_as=rename_as,
284
- id_col=id_col,
285
- rank_and_fill=rank_and_fill,
286
- )
287
-
288
-
289
- def validate_and_clean_submission_crypto(
290
- universe: pd.DataFrame,
291
- submission: pd.DataFrame,
292
- id_col: str = "symbol",
293
- rename_as: Optional[str] = None,
294
- rank_and_fill: bool = True,
295
- ):
296
- ticker_col, signal_col = validate_headers_crypto(submission)
297
- filtered_sub, invalid_tickers = validate_ids_crypto(
298
- universe[ticker_col], submission, ticker_col
299
- )
300
- validate_values(filtered_sub, signal_col)
301
- return clean_submission(
302
- live_ids=universe,
303
- predictions=filtered_sub,
304
- ticker_col=ticker_col,
305
- signal_col=signal_col,
306
- rename_as=rename_as,
307
- id_col=id_col,
308
- rank_and_fill=rank_and_fill,
309
- )
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "numerai-tools"
3
- version = "0.5.0.dev4"
3
+ version = "0.5.0.dev5"
4
4
  description = "A collection of open-source tools to help interact with Numerai, model data, and automate submissions."
5
5
  authors = [
6
6
  {name = "Numerai Engineering",email = "engineering@numer.ai"}