numerai-tools 0.5.0.dev10__tar.gz → 0.5.0.dev12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: numerai-tools
3
- Version: 0.5.0.dev10
3
+ Version: 0.5.0.dev12
4
4
  Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
5
5
  License: MIT
6
6
  Author: Numerai Engineering
@@ -560,7 +560,7 @@ def generate_neutralized_weights(
560
560
  neutralizers: pd.DataFrame,
561
561
  sample_weights: pd.Series,
562
562
  center_and_normalize: bool = False,
563
- ) -> pd.Series:
563
+ ) -> pd.DataFrame:
564
564
  assert not predictions.isna().any().any(), "Predictions contain NaNs"
565
565
  assert not neutralizers.isna().any().any(), "Normalization factors contain NaNs"
566
566
  assert not sample_weights.isna().any(), "Weights contain NaNs"
@@ -9,7 +9,6 @@ from numerai_tools.scoring import (
9
9
  from numerai_tools.submissions import (
10
10
  validate_submission_signals,
11
11
  clean_submission,
12
- remap_ids,
13
12
  )
14
13
 
15
14
  import pandas as pd
@@ -76,47 +75,47 @@ def turnover(
76
75
 
77
76
 
78
77
  def calculate_max_churn_and_turnover(
79
- curr_sub: pd.DataFrame,
78
+ curr_sub: pd.Series,
80
79
  curr_neutralizer: pd.DataFrame,
81
- curr_weight: pd.Series,
82
- prev_week_subs: dict[str, pd.DataFrame],
80
+ curr_sample_weight: pd.Series,
81
+ prev_subs: dict[str, pd.Series],
83
82
  prev_neutralizers: dict[str, pd.DataFrame],
84
83
  prev_sample_weights: dict[str, pd.Series],
85
- universe: pd.DataFrame,
86
- curr_signal_col: str,
87
- curr_ticker_col: str,
88
84
  ) -> Tuple[float, float]:
89
85
  """Calculate the maximum churn and turnover with respect to previous submissions.
86
+ This function iterates over previous submissions and calculates churn and turnover
87
+ for each submission against the current submission. It expects all data to be
88
+ indexed on the same type of tickers/IDs (e.g. all numerai_ticker, all composite_figi, etc.).
90
89
 
91
90
  Arguments:
92
- curr_sub -- the current submission
93
- curr_neutralizer -- the neutralizer DataFrame for the current submission
94
- curr_weight -- the sample weights Series for the current submission
95
- prev_week_subs -- a dictionary of datestamps to submissions
96
- prev_neutralizers -- a dictionary of datestamps to neutralizers
97
- prev_sample_weights -- a dictionary of datestamps to sample weights
98
- universe -- the universe DataFrame for the current era
99
- curr_signal_col -- the column name for signal in the current submission
100
- curr_ticker_col -- the column name for tickers in the current submission
101
-
91
+ curr_sub: pd.Series - the current submission as a Series indexed on tickers/ids
92
+ curr_neutralizer: pd.DataFrame - the neutralizer DataFrame for the current submission indexed on numerai_ticker
93
+ curr_sample_weight: pd.Series - the sample weights Series for the current submission indexed on numerai_ticker
94
+ prev_subs: dict[str, pd.Series] - a dictionary of datestamps to submissions, where each submission is a Series
95
+ of signals/predictions indexed on tickers/ids. To calculate churn
96
+ and turnover for a live submission, use the most recent 5 submissions. For diagnostics,
97
+ just provide the previous era.
98
+ prev_neutralizers: dict[str, pd.DataFrame] - a dictionary of datestamps to neutralizers DataFrames where each neutralizers
99
+ DataFrame is indexed on the same ticker column as the current submission
100
+ prev_sample_weights: dict[str, pd.Series] - a dictionary of datestamps to sample weights where each sample weights
101
+ Series is indexed on the same ticker column as the current submission
102
102
  Returns:
103
103
  prev_week_max_churn -- the maximum churn from previous submissions
104
104
  prev_week_max_turnover -- the maximum turnover from previous submissions
105
105
  """
106
- universe = universe.reset_index()
107
106
  (
108
107
  curr_ticker_col,
109
108
  curr_signal_col,
110
109
  _,
111
- curr_sub,
110
+ curr_sub_df,
112
111
  _,
113
112
  ) = validate_submission_signals(
114
- universe=universe,
115
- submission=curr_sub,
113
+ universe=curr_sample_weight.index.to_frame(),
114
+ submission=curr_sub.reset_index(),
116
115
  )
117
- curr_sub_vector = clean_submission(
118
- universe=universe,
119
- submission=curr_sub,
116
+ curr_sub = clean_submission(
117
+ universe=curr_sample_weight.index.to_frame(),
118
+ submission=curr_sub_df,
120
119
  src_id_col=curr_ticker_col,
121
120
  src_signal_col=curr_signal_col,
122
121
  rank_and_fill=True,
@@ -124,63 +123,42 @@ def calculate_max_churn_and_turnover(
124
123
  churn_stats = []
125
124
  turnover_stats = []
126
125
  neutralized_weights = generate_neutralized_weights(
127
- curr_sub_vector.to_frame(),
126
+ curr_sub.to_frame(),
128
127
  curr_neutralizer,
129
- curr_weight,
128
+ curr_sample_weight,
130
129
  center_and_normalize=True,
131
- )[curr_sub_vector.name]
132
- for datestamp in prev_week_subs:
133
- prev_sub = prev_week_subs[datestamp]
130
+ )[curr_sub.name]
131
+ for datestamp in prev_subs:
132
+ prev_sub = prev_subs[datestamp]
134
133
  prev_neutralizer = prev_neutralizers[datestamp]
135
- prev_weight = prev_sample_weights[datestamp]
134
+ prev_sample_weight = prev_sample_weights[datestamp]
136
135
  (
137
136
  prev_ticker_col,
138
137
  prev_signal_col,
139
138
  _,
140
- prev_sub,
139
+ prev_sub_df,
141
140
  _,
142
141
  ) = validate_submission_signals(
143
- universe=universe,
144
- submission=prev_sub,
142
+ universe=prev_sample_weight.index.to_frame(),
143
+ submission=prev_sub.reset_index(),
145
144
  )
146
- filtered_prev_sub = clean_submission(
147
- universe=universe,
148
- submission=prev_sub,
145
+ prev_sub = clean_submission(
146
+ universe=prev_sample_weight.index.to_frame(),
147
+ submission=prev_sub_df,
149
148
  src_id_col=prev_ticker_col,
150
149
  src_signal_col=prev_signal_col,
151
150
  dst_id_col=curr_ticker_col,
152
151
  dst_signal_col=curr_signal_col,
153
152
  rank_and_fill=True,
154
153
  )
155
- prev_neutralizer = (
156
- remap_ids(
157
- prev_neutralizer.reset_index(),
158
- universe,
159
- str(prev_neutralizer.index.name),
160
- curr_ticker_col,
161
- )
162
- .set_index(curr_ticker_col)
163
- .filter(like="neutralizer_")
164
- .dropna()
165
- )
166
- prev_weight = (
167
- remap_ids(
168
- prev_weight.reset_index(),
169
- universe,
170
- str(prev_weight.index.name),
171
- curr_ticker_col,
172
- )
173
- .set_index(curr_ticker_col)[prev_weight.name]
174
- .dropna()
175
- )
176
154
  prev_neutralized_weights = generate_neutralized_weights(
177
- filtered_prev_sub.to_frame(),
155
+ prev_sub.to_frame(),
178
156
  prev_neutralizer,
179
- prev_weight,
157
+ prev_sample_weight,
180
158
  center_and_normalize=True,
181
- )[filtered_prev_sub.name]
159
+ )[prev_sub.name]
182
160
  try:
183
- churn_val = abs(churn(curr_sub_vector, filtered_prev_sub))
161
+ churn_val = abs(churn(curr_sub, prev_sub))
184
162
  except AssertionError as e:
185
163
  if "does not have enough overlapping ids" in str(e):
186
164
  continue
@@ -228,11 +228,7 @@ def validate_submission_signals(
228
228
  "data_type column found in Signals submission. This is deprecated and support will be removed in the future. "
229
229
  "Please remove the data_type column from your Signals submission."
230
230
  )
231
- submission.drop(
232
- columns=["data_type"],
233
- errors="ignore",
234
- inplace=True,
235
- )
231
+ submission.drop(columns=["data_type"], errors="ignore", inplace=True)
236
232
  ticker_col, signal_col, date_col = validate_headers_signals(
237
233
  submission, assert_date_col
238
234
  )
@@ -266,7 +262,7 @@ def validate_submission_crypto(
266
262
 
267
263
  def remap_ids(
268
264
  data: pd.DataFrame,
269
- ticker_map: pd.Series | pd.DataFrame,
265
+ ticker_map: pd.DataFrame,
270
266
  src_id_col: str,
271
267
  dst_id_col: str,
272
268
  ) -> pd.DataFrame:
@@ -277,25 +273,25 @@ def remap_ids(
277
273
 
278
274
  Arguments:
279
275
  data: pd.DataFrame - the data to remap
280
- ticker_map: pd.Series | pd.DataFrame - the mapping of source ids to destination ids
276
+ ticker_map: pd.DataFrame - the mapping of source ids to destination ids
281
277
  src_id_col: str - the name of the source ids column in the data
282
278
  dst_id_col: str - the name of the destination ids column in the ticker map
283
279
  """
284
280
  # first, index the universe and data on the source ids
285
- indexed_map = ticker_map.reset_index().set_index(src_id_col)
281
+ indexed_map = ticker_map.set_index(src_id_col, drop=False)
286
282
  indexed_data = data.set_index(src_id_col)
287
283
  return (
288
284
  # then, join the universe and data
289
285
  indexed_map.join(indexed_data)
290
286
  # get just the destination ids and prediction columns
291
- .reset_index()[[dst_id_col, *indexed_data.columns]]
287
+ .reset_index(drop=True)[[dst_id_col, *indexed_data.columns]]
292
288
  # finally, sort by the destination ticker column
293
289
  .sort_values(dst_id_col)
294
290
  )
295
291
 
296
292
 
297
293
  def clean_submission(
298
- universe: pd.Series | pd.DataFrame,
294
+ universe: pd.DataFrame,
299
295
  submission: pd.DataFrame,
300
296
  src_id_col: str,
301
297
  src_signal_col: str,
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "numerai-tools"
3
- version = "0.5.0.dev10"
3
+ version = "0.5.0.dev12"
4
4
  description = "A collection of open-source tools to help interact with Numerai, model data, and automate submissions."
5
5
  authors = [
6
6
  {name = "Numerai Engineering",email = "engineering@numer.ai"}