numerai-tools 0.5.0.dev4__tar.gz → 0.5.0.dev6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: numerai-tools
3
- Version: 0.5.0.dev4
3
+ Version: 0.5.0.dev6
4
4
  Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
5
5
  License: MIT
6
6
  Author: Numerai Engineering
@@ -7,7 +7,8 @@ from numerai_tools.scoring import (
7
7
  generate_neutralized_weights,
8
8
  )
9
9
  from numerai_tools.submissions import (
10
- validate_and_clean_submission_signals,
10
+ validate_submission_signals,
11
+ clean_submission,
11
12
  remap_ids,
12
13
  )
13
14
 
@@ -65,9 +66,6 @@ def turnover(
65
66
  Arguments:
66
67
  s1: pd.Series - the first series to compare
67
68
  s2: pd.Series - the second series to compare
68
- top_bottom: Optional[int] - the number of top and bottom predictions to use
69
- when calculating the correlation. Results in
70
- 2*top_bottom predictions.
71
69
 
72
70
  Returns:
73
71
  float - the turnover between the two series
@@ -106,27 +104,50 @@ def calculate_max_churn_and_turnover(
106
104
  prev_week_max_turnover -- the maximum turnover from previous submissions
107
105
  """
108
106
  universe = universe.reset_index()
109
- curr_sub_vector = validate_and_clean_submission_signals(
107
+ (
108
+ curr_ticker_col,
109
+ curr_signal_col,
110
+ curr_sub,
111
+ _,
112
+ ) = validate_submission_signals(
110
113
  universe=universe,
111
114
  submission=curr_sub,
112
- id_col=curr_ticker_col,
113
- rename_as=curr_signal_col,
115
+ )
116
+ curr_sub_vector = clean_submission(
117
+ universe=universe,
118
+ submission=curr_sub,
119
+ src_id_col=curr_ticker_col,
120
+ src_signal_col=curr_signal_col,
114
121
  rank_and_fill=True,
115
122
  )
116
123
  churn_stats = []
117
124
  turnover_stats = []
118
125
  neutralized_weights = generate_neutralized_weights(
119
- curr_sub_vector.to_frame(), curr_neutralizer, curr_weight
126
+ curr_sub_vector.to_frame(),
127
+ curr_neutralizer,
128
+ curr_weight,
129
+ center_and_normalize=True,
120
130
  )
121
131
  for datestamp in prev_week_subs:
122
132
  prev_sub = prev_week_subs[datestamp]
123
133
  prev_neutralizer = prev_neutralizers[datestamp]
124
134
  prev_weight = prev_sample_weights[datestamp]
125
- filtered_prev_sub = validate_and_clean_submission_signals(
135
+ (
136
+ prev_ticker_col,
137
+ prev_signal_col,
138
+ prev_sub,
139
+ _,
140
+ ) = validate_submission_signals(
141
+ universe=universe,
142
+ submission=prev_sub,
143
+ )
144
+ filtered_prev_sub = clean_submission(
126
145
  universe=universe,
127
146
  submission=prev_sub,
128
- id_col=curr_ticker_col,
129
- rename_as=curr_signal_col,
147
+ src_id_col=prev_ticker_col,
148
+ src_signal_col=prev_signal_col,
149
+ dst_id_col=curr_ticker_col,
150
+ dst_signal_col=curr_signal_col,
130
151
  rank_and_fill=True,
131
152
  )
132
153
  prev_neutralizer = (
@@ -146,7 +167,10 @@ def calculate_max_churn_and_turnover(
146
167
  curr_ticker_col,
147
168
  ).set_index(curr_ticker_col)[prev_weight.name]
148
169
  prev_neutralized_weights = generate_neutralized_weights(
149
- filtered_prev_sub.to_frame(), prev_neutralizer, prev_weight
170
+ filtered_prev_sub.to_frame(),
171
+ prev_neutralizer,
172
+ prev_weight,
173
+ center_and_normalize=True,
150
174
  )
151
175
  try:
152
176
  churn_val = abs(churn(curr_sub_vector, filtered_prev_sub))
@@ -28,7 +28,9 @@ logger = logging.getLogger(__name__)
28
28
 
29
29
 
30
30
  def _validate_headers(
31
- expected_id_cols: List[str], expected_pred_cols: List[str], submission: pd.DataFrame
31
+ submission: pd.DataFrame,
32
+ expected_id_cols: List[str],
33
+ expected_pred_cols: List[str],
32
34
  ) -> Tuple[str, str]:
33
35
  """Validate the given submission has the right headers.
34
36
  It is recommended to use one of the following functions instead of this one:
@@ -37,6 +39,8 @@ def _validate_headers(
37
39
 
38
40
  Arguments:
39
41
  submission -- pandas DataFrame of the submission
42
+ expected_id_cols -- list of expected id columns
43
+ expected_pred_cols -- list of expected prediction columns
40
44
 
41
45
  Return Tuple[str, str]:
42
46
  - string name of the id column
@@ -58,19 +62,43 @@ def _validate_headers(
58
62
 
59
63
  def validate_headers_numerai(submission: pd.DataFrame) -> Tuple[str, str]:
60
64
  return _validate_headers(
61
- NUMERAI_ALLOWED_ID_COLS, NUMERAI_ALLOWED_PRED_COLS, submission
65
+ submission,
66
+ NUMERAI_ALLOWED_ID_COLS,
67
+ NUMERAI_ALLOWED_PRED_COLS,
62
68
  )
63
69
 
64
70
 
65
- def validate_headers_signals(submission: pd.DataFrame) -> Tuple[str, str]:
66
- return _validate_headers(
67
- SIGNALS_ALLOWED_ID_COLS, SIGNALS_ALLOWED_PRED_COLS, submission
71
+ def validate_headers_signals(
72
+ submission: pd.DataFrame, assert_date_col: bool = False
73
+ ) -> Tuple[str, str, Optional[str]]:
74
+ # remove date columns if they exist and store them temporarily
75
+ date_col_name: Optional[str] = None
76
+ date_col: Optional[pd.Series] = None
77
+ for col in submission.columns:
78
+ if col in SIGNALS_ALLOWED_DATE_COLS:
79
+ date_col_name = col
80
+ date_col = submission[date_col_name].copy()
81
+ submission = submission.drop(columns=date_col_name, errors="ignore")
82
+ break
83
+ if assert_date_col:
84
+ assert (
85
+ date_col_name is not None
86
+ ), "invalid_submission_headers: submission must contain a date column"
87
+ ticker_col, signal_col = _validate_headers(
88
+ submission,
89
+ SIGNALS_ALLOWED_ID_COLS,
90
+ SIGNALS_ALLOWED_PRED_COLS,
68
91
  )
92
+ if date_col is not None:
93
+ submission[date_col_name] = date_col
94
+ return ticker_col, signal_col, date_col_name
69
95
 
70
96
 
71
97
  def validate_headers_crypto(submission: pd.DataFrame) -> Tuple[str, str]:
72
98
  return _validate_headers(
73
- CRYPTO_ALLOWED_ID_COLS, CRYPTO_ALLOWED_PRED_COLS, submission
99
+ submission,
100
+ CRYPTO_ALLOWED_ID_COLS,
101
+ CRYPTO_ALLOWED_PRED_COLS,
74
102
  )
75
103
 
76
104
 
@@ -156,12 +184,93 @@ def validate_ids_crypto(
156
184
  return _validate_ids(live_ids, submission, id_col, CRYPTO_MIN_TICKERS)
157
185
 
158
186
 
187
+ def validate_submission_numerai(
188
+ universe: pd.Series, submission: pd.DataFrame
189
+ ) -> Tuple[str, str, pd.DataFrame, List[str]]:
190
+ """Validate the headers, ids, and values for a submission.
191
+
192
+ Arguments:
193
+ universe: pd.Series - the live universe of ids on which the predictions are based
194
+ submission: pd.DataFrame - the predictions to validate
195
+
196
+ Returns:
197
+ Tuple[str, str, pd.DataFrame, List[str]] - the validated ticker column, signal column,
198
+ filtered submission, and list of invalid tickers
199
+ """
200
+ ticker_col, signal_col = validate_headers_numerai(submission)
201
+ filtered_sub, invalid_tickers = validate_ids_numerai(
202
+ universe, submission, ticker_col
203
+ )
204
+ validate_values(filtered_sub, signal_col)
205
+ return ticker_col, signal_col, filtered_sub, invalid_tickers
206
+
207
+
208
+ def validate_submission_signals(
209
+ universe: pd.DataFrame, submission: pd.DataFrame
210
+ ) -> Tuple[str, str, pd.DataFrame, List[str]]:
211
+ """Validate the headers, ids, and values for a submission.
212
+
213
+ Arguments:
214
+ universe: pd.DataFrame - the live universe of ids on which the predictions are based
215
+ submission: pd.DataFrame - the predictions to validate
216
+
217
+ Returns:
218
+ Tuple[str, str, pd.DataFrame, List[str]] - the validated ticker column, signal column,
219
+ filtered submission, and list of invalid tickers
220
+ """
221
+ # drop the deprecated data_type column if it exists (date columns are handled by validate_headers_signals)
222
+ if "data_type" in submission.columns:
223
+ logger.warning(
224
+ "data_type column found in Signals submission. This is deprecated and support will be removed in the future. "
225
+ "Please remove the data_type column from your Signals submission."
226
+ )
227
+ submission = submission.drop(columns=["data_type"], errors="ignore")
228
+ ticker_col, signal_col, _ = validate_headers_signals(submission)
229
+ filtered_sub, invalid_tickers = validate_ids_signals(
230
+ universe[ticker_col], submission, ticker_col
231
+ )
232
+ validate_values(filtered_sub, signal_col)
233
+ return ticker_col, signal_col, filtered_sub, invalid_tickers
234
+
235
+
236
+ def validate_submission_crypto(
237
+ universe: pd.DataFrame, submission: pd.DataFrame
238
+ ) -> Tuple[str, str, pd.DataFrame, List[str]]:
239
+ """Validate the headers, ids, and values for a submission.
240
+
241
+ Arguments:
242
+ universe: pd.DataFrame - the live universe of ids on which the predictions are based
243
+ submission: pd.DataFrame - the predictions to validate
244
+
245
+ Returns:
246
+ Tuple[str, str, pd.DataFrame, List[str]] - the validated ticker column, signal column,
247
+ filtered submission, and list of invalid tickers
248
+ """
249
+ ticker_col, signal_col = validate_headers_crypto(submission)
250
+ filtered_sub, invalid_tickers = validate_ids_crypto(
251
+ universe[ticker_col], submission, ticker_col
252
+ )
253
+ validate_values(filtered_sub, signal_col)
254
+ return ticker_col, signal_col, filtered_sub, invalid_tickers
255
+
256
+
159
257
  def remap_ids(
160
258
  data: pd.DataFrame,
161
259
  ticker_map: pd.Series | pd.DataFrame,
162
260
  src_id_col: str,
163
261
  dst_id_col: str,
164
262
  ) -> pd.DataFrame:
263
+ """Join the data to the ticker map based on source ids
264
+ and remap to the destination ids. If the ticker map is a Series, it is assumed that
265
+ src_id_col and dst_id_col are the same, and the ticker map is simply used to
266
+ ensure the data has all ids in the ticker map.
267
+
268
+ Arguments:
269
+ data: pd.DataFrame - the data to remap
270
+ ticker_map: pd.Series | pd.DataFrame - the mapping of source ids to destination ids
271
+ src_id_col: str - the name of the source ids column in the data
272
+ dst_id_col: str - the name of the destination ids column in the ticker map
273
+ """
165
274
  # first, index the universe and data on the source ids
166
275
  indexed_map = ticker_map.reset_index().set_index(src_id_col)
167
276
  indexed_data = data.set_index(src_id_col)
@@ -176,134 +285,58 @@ def remap_ids(
176
285
 
177
286
 
178
287
  def clean_submission(
179
- live_ids: pd.Series | pd.DataFrame,
180
- predictions: pd.DataFrame,
181
- ticker_col: str,
182
- signal_col: str,
183
- rename_as: Optional[str],
184
- id_col: str,
185
- rank_and_fill: bool,
288
+ universe: pd.Series | pd.DataFrame,
289
+ submission: pd.DataFrame,
290
+ src_id_col: str,
291
+ src_signal_col: str,
292
+ dst_id_col: Optional[str] = None,
293
+ dst_signal_col: Optional[str] = None,
294
+ rank_and_fill: bool = False,
186
295
  ) -> pd.Series:
187
- """Prepare predictions for submission to Numerai.
188
- Filters out ids not in live data, drops duplicates, sets ids as index,
189
- then optionally ranks (keeping ties) and fills NaNs with 0.5.
296
+ """Prepares your submission for uploading to a Numerai tournament.
297
+ Joins your submission to the universe, remaps ids as needed, drops
298
+ duplicates, sets ids as index, renames the series, then optionally
299
+ tie-kept ranks and fills NaNs with 0.5.
190
300
 
191
301
  This function is used in Numerai to clean submissions for use in the
192
- Meta Model and scoring. We only rank and fill in preparation for scoring
193
- Signals and Crypto submissions.
302
+ Meta Model and scoring. We rank and fill submissions before scoring.
194
303
 
195
304
  Arguments:
196
- live_ids: pd.Series - the ids in the live data
197
- predictions: pd.DataFrame - the predictions to clean
198
- ticker_col: str - the name of the ids column
199
- signal_col: str - the name of the predictions column
200
- rename_as: Optional[str] - the string to which the submission should be renamed
201
- id_col: str - the column name of the ids
202
- rank_and_fill: bool - whether to rank and fill NaNs with 0.5
305
+ universe: pd.Series | pd.DataFrame - the live universe of ids on which the predictions are based
306
+ submission: pd.DataFrame - the submission to clean
307
+ src_id_col: str - the name of the ids column
308
+ src_signal_col: str - the name of the predictions column
309
+ dst_id_col: Optional[str] - optional name of the id column to map the ids to
310
+ dst_signal_col: Optional[str] - optional name of the signal column to rename the submission to
311
+ rank_and_fill: bool - whether to call tie_kept_rank and then fill NaNs with 0.5
203
312
 
204
313
  Returns:
205
- pd.Series - the cleaned prediction series with ids as index
314
+ pd.Series - the cleaned, properly indexed submission
206
315
  """
207
- assert len(live_ids) > 0, "live_ids must not be empty"
208
- if isinstance(live_ids, pd.DataFrame):
209
- assert live_ids.isna().sum().sum() == 0, "live_ids must not contain NaNs"
316
+ assert len(universe) > 0, "universe must not be empty"
317
+ if isinstance(universe, pd.DataFrame):
318
+ assert universe.isna().sum().sum() == 0, "universe must not contain NaNs"
210
319
  else:
211
- assert live_ids.isna().sum() == 0, "live_ids must not contain NaNs"
212
- assert len(predictions) > 0, "predictions must not be empty"
320
+ assert universe.isna().sum() == 0, "universe must not contain NaNs"
321
+ assert len(submission) > 0, "predictions must not be empty"
322
+
323
+ if dst_id_col is None:
324
+ dst_id_col = src_id_col
325
+ if dst_signal_col is None:
326
+ dst_signal_col = src_signal_col
213
327
 
214
328
  clean_preds = (
215
- remap_ids(predictions, live_ids, ticker_col, id_col)
329
+ remap_ids(submission, universe, src_id_col, dst_id_col)
216
330
  # drop NaNs and duplicates
217
- .dropna(subset=[id_col])
218
- .drop_duplicates(subset=id_col, keep="first")
331
+ .dropna(subset=[dst_id_col])
332
+ .drop_duplicates(subset=dst_id_col, keep="first")
219
333
  # set ids as index and sort
220
- .set_index(id_col)
334
+ .set_index(dst_id_col)
221
335
  .sort_index()
222
336
  # rename to given name
223
- .rename(columns={signal_col: rename_as})
224
- )[rename_as]
337
+ .rename(columns={src_signal_col: dst_signal_col})
338
+ )[dst_signal_col]
225
339
  # rank and fill with 0.5
226
340
  if rank_and_fill:
227
341
  clean_preds = tie_kept_rank(clean_preds).fillna(0.5)
228
342
  return clean_preds
229
-
230
-
231
- def validate_and_clean_submission_numerai(
232
- universe: pd.Series,
233
- submission: pd.DataFrame,
234
- id_col: str = "id",
235
- rename_as: Optional[str] = None,
236
- rank_and_fill: bool = False,
237
- ) -> pd.Series:
238
- ticker_col, signal_col = validate_headers_numerai(submission)
239
- filtered_sub, invalid_tickers = validate_ids_numerai(
240
- universe, submission, ticker_col
241
- )
242
- validate_values(filtered_sub, signal_col)
243
- return clean_submission(
244
- live_ids=universe,
245
- predictions=filtered_sub,
246
- ticker_col=ticker_col,
247
- signal_col=signal_col,
248
- rename_as=rename_as,
249
- id_col=id_col,
250
- rank_and_fill=rank_and_fill,
251
- )
252
-
253
-
254
- def validate_and_clean_submission_signals(
255
- universe: pd.DataFrame,
256
- submission: pd.DataFrame,
257
- id_col: str,
258
- rename_as: Optional[str] = None,
259
- rank_and_fill: bool = True,
260
- ) -> pd.Series:
261
- # drop data_type and date columns if they exist
262
- if "data_type" in submission.columns:
263
- logger.warning(
264
- "data_type column found in Signals submission. This is deprecated and support will be removed in the future. "
265
- "Please remove the data_type column from your Signals submission."
266
- )
267
- date_col = [
268
- date_col
269
- for date_col in SIGNALS_ALLOWED_DATE_COLS
270
- if date_col in list(submission.columns)
271
- ]
272
- submission = submission.drop(columns=["data_type", *date_col], errors="ignore")
273
- ticker_col, signal_col = validate_headers_signals(submission)
274
- filtered_sub, invalid_tickers = validate_ids_signals(
275
- universe[ticker_col], submission, ticker_col
276
- )
277
- validate_values(filtered_sub, signal_col)
278
- return clean_submission(
279
- live_ids=universe,
280
- predictions=filtered_sub,
281
- ticker_col=ticker_col,
282
- signal_col=signal_col,
283
- rename_as=rename_as,
284
- id_col=id_col,
285
- rank_and_fill=rank_and_fill,
286
- )
287
-
288
-
289
- def validate_and_clean_submission_crypto(
290
- universe: pd.DataFrame,
291
- submission: pd.DataFrame,
292
- id_col: str = "symbol",
293
- rename_as: Optional[str] = None,
294
- rank_and_fill: bool = True,
295
- ):
296
- ticker_col, signal_col = validate_headers_crypto(submission)
297
- filtered_sub, invalid_tickers = validate_ids_crypto(
298
- universe[ticker_col], submission, ticker_col
299
- )
300
- validate_values(filtered_sub, signal_col)
301
- return clean_submission(
302
- live_ids=universe,
303
- predictions=filtered_sub,
304
- ticker_col=ticker_col,
305
- signal_col=signal_col,
306
- rename_as=rename_as,
307
- id_col=id_col,
308
- rank_and_fill=rank_and_fill,
309
- )
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "numerai-tools"
3
- version = "0.5.0.dev4"
3
+ version = "0.5.0.dev6"
4
4
  description = "A collection of open-source tools to help interact with Numerai, model data, and automate submissions."
5
5
  authors = [
6
6
  {name = "Numerai Engineering",email = "engineering@numer.ai"}