numerai-tools 0.5.0.dev9__tar.gz → 0.5.0.dev11__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: numerai-tools
3
- Version: 0.5.0.dev9
3
+ Version: 0.5.0.dev11
4
4
  Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
5
5
  License: MIT
6
6
  Author: Numerai Engineering
@@ -560,7 +560,7 @@ def generate_neutralized_weights(
560
560
  neutralizers: pd.DataFrame,
561
561
  sample_weights: pd.Series,
562
562
  center_and_normalize: bool = False,
563
- ) -> pd.Series:
563
+ ) -> pd.DataFrame:
564
564
  assert not predictions.isna().any().any(), "Predictions contain NaNs"
565
565
  assert not neutralizers.isna().any().any(), "Normalization factors contain NaNs"
566
566
  assert not sample_weights.isna().any(), "Weights contain NaNs"
@@ -76,106 +76,92 @@ def turnover(
76
76
 
77
77
 
78
78
  def calculate_max_churn_and_turnover(
79
- curr_sub: pd.DataFrame,
79
+ curr_sub: pd.Series,
80
80
  curr_neutralizer: pd.DataFrame,
81
- curr_weight: pd.Series,
82
- prev_week_subs: dict[str, pd.DataFrame],
81
+ curr_sample_weight: pd.Series,
82
+ prev_subs: dict[str, pd.Series],
83
83
  prev_neutralizers: dict[str, pd.DataFrame],
84
84
  prev_sample_weights: dict[str, pd.Series],
85
- universe: pd.DataFrame,
86
- curr_signal_col: str,
87
- curr_ticker_col: str,
88
85
  ) -> Tuple[float, float]:
89
86
  """Calculate the maximum churn and turnover with respect to previous submissions.
87
+ This function iterates over previous submissions and calculates churn and turnover
88
+ for each submission against the current submission. It expects all data to be
89
+ indexed on the same type of tickers/IDs (e.g. all numerai_ticker or all composite_figi).
90
90
 
91
91
  Arguments:
92
- curr_sub -- the current submission
93
- curr_neutralizer -- the neutralizer DataFrame for the current submission
94
- curr_weight -- the sample weights Series for the current submission
95
- prev_week_subs -- a dictionary of datestamps to submissions
96
- prev_neutralizers -- a dictionary of datestamps to neutralizers
97
- prev_sample_weights -- a dictionary of datestamps to sample weights
98
- universe -- the universe DataFrame for the current era
99
- curr_signal_col -- the column name for signal in the current submission
100
- curr_ticker_col -- the column name for tickers in the current submission
101
-
92
+ curr_sub: pd.Series - the current submission as a Series indexed on tickers/ids
93
+ curr_neutralizer: pd.DataFrame - the neutralizer DataFrame for the current submission indexed on numerai_ticker
94
+ curr_sample_weight: pd.Series - the sample weights Series for the current submission indexed on numerai_ticker
95
+ prev_subs: dict[str, pd.Series] - a dictionary of datestamps to submissions, where each submission is a Series
96
+ indexed on tickers/ids, like the current submission. To calculate churn
97
+ and turnover for a live submission, use the most recent 5 submissions. For diagnostics,
98
+ just provide the previous era.
99
+ prev_neutralizers: dict[str, pd.DataFrame] - a dictionary of datestamps to neutralizers DataFrames where each neutralizers
100
+ DataFrame is indexed on the same ticker column as the current submission
101
+ prev_sample_weights: dict[str, pd.Series] - a dictionary of datestamps to sample weights where each sample weights
102
+ Series is indexed on the same ticker column as the current submission
102
103
  Returns:
103
104
  prev_week_max_churn -- the maximum churn from previous submissions
104
105
  prev_week_max_turnover -- the maximum turnover from previous submissions
105
106
  """
106
- universe = universe.reset_index()
107
107
  (
108
108
  curr_ticker_col,
109
109
  curr_signal_col,
110
110
  _,
111
- curr_sub,
111
+ curr_sub_df,
112
112
  _,
113
113
  ) = validate_submission_signals(
114
- universe=universe,
115
- submission=curr_sub,
114
+ universe=curr_sample_weight.index.to_frame(),
115
+ submission=curr_sub.reset_index(),
116
116
  )
117
- curr_sub_vector = clean_submission(
118
- universe=universe,
119
- submission=curr_sub,
117
+ curr_sub = clean_submission(
118
+ universe=curr_sample_weight.index.to_frame(),
119
+ submission=curr_sub_df,
120
120
  src_id_col=curr_ticker_col,
121
121
  src_signal_col=curr_signal_col,
122
122
  rank_and_fill=True,
123
123
  )
124
+ print("curr_sub", curr_sub)
124
125
  churn_stats = []
125
126
  turnover_stats = []
126
127
  neutralized_weights = generate_neutralized_weights(
127
- curr_sub_vector.to_frame(),
128
+ curr_sub.to_frame(),
128
129
  curr_neutralizer,
129
- curr_weight,
130
+ curr_sample_weight,
130
131
  center_and_normalize=True,
131
- )
132
- for datestamp in prev_week_subs:
133
- prev_sub = prev_week_subs[datestamp]
132
+ )[curr_sub.name]
133
+ for datestamp in prev_subs:
134
+ prev_sub = prev_subs[datestamp]
134
135
  prev_neutralizer = prev_neutralizers[datestamp]
135
- prev_weight = prev_sample_weights[datestamp]
136
+ prev_sample_weight = prev_sample_weights[datestamp]
136
137
  (
137
138
  prev_ticker_col,
138
139
  prev_signal_col,
139
140
  _,
140
- prev_sub,
141
+ prev_sub_df,
141
142
  _,
142
143
  ) = validate_submission_signals(
143
- universe=universe,
144
- submission=prev_sub,
144
+ universe=prev_sample_weight.index.to_frame(),
145
+ submission=prev_sub.reset_index(),
145
146
  )
146
- filtered_prev_sub = clean_submission(
147
- universe=universe,
148
- submission=prev_sub,
147
+ prev_sub = clean_submission(
148
+ universe=prev_sample_weight.index.to_frame(),
149
+ submission=prev_sub_df,
149
150
  src_id_col=prev_ticker_col,
150
151
  src_signal_col=prev_signal_col,
151
152
  dst_id_col=curr_ticker_col,
152
153
  dst_signal_col=curr_signal_col,
153
154
  rank_and_fill=True,
154
155
  )
155
- prev_neutralizer = (
156
- remap_ids(
157
- prev_neutralizer.reset_index(),
158
- universe,
159
- str(prev_neutralizer.index.name),
160
- curr_ticker_col,
161
- )
162
- .set_index(curr_ticker_col)
163
- .filter(like="neutralizer_")
164
- )
165
- prev_weight = remap_ids(
166
- prev_weight.reset_index(),
167
- universe,
168
- str(prev_weight.index.name),
169
- curr_ticker_col,
170
- ).set_index(curr_ticker_col)[prev_weight.name]
171
156
  prev_neutralized_weights = generate_neutralized_weights(
172
- filtered_prev_sub.to_frame(),
157
+ prev_sub.to_frame(),
173
158
  prev_neutralizer,
174
- prev_weight,
159
+ prev_sample_weight,
175
160
  center_and_normalize=True,
176
- )
161
+ )[prev_sub.name]
162
+ print("prev_sub", prev_sub)
177
163
  try:
178
- churn_val = abs(churn(curr_sub_vector, filtered_prev_sub))
164
+ churn_val = abs(churn(curr_sub, prev_sub))
179
165
  except AssertionError as e:
180
166
  if "does not have enough overlapping ids" in str(e):
181
167
  continue
@@ -88,6 +88,7 @@ def validate_headers_signals(
88
88
  assert (
89
89
  date_col_name is not None
90
90
  ), "invalid_submission_headers: submission must contain a date column"
91
+ print(submission)
91
92
  ticker_col, signal_col = _validate_headers(
92
93
  submission,
93
94
  SIGNALS_ALLOWED_ID_COLS,
@@ -228,11 +229,8 @@ def validate_submission_signals(
228
229
  "data_type column found in Signals submission. This is deprecated and support will be removed in the future. "
229
230
  "Please remove the data_type column from your Signals submission."
230
231
  )
231
- submission.drop(
232
- columns=["data_type"],
233
- errors="ignore",
234
- inplace=True,
235
- )
232
+ submission.drop(columns=["data_type"], errors="ignore", inplace=True)
233
+ print(submission)
236
234
  ticker_col, signal_col, date_col = validate_headers_signals(
237
235
  submission, assert_date_col
238
236
  )
@@ -266,7 +264,7 @@ def validate_submission_crypto(
266
264
 
267
265
  def remap_ids(
268
266
  data: pd.DataFrame,
269
- ticker_map: pd.Series | pd.DataFrame,
267
+ ticker_map: pd.DataFrame,
270
268
  src_id_col: str,
271
269
  dst_id_col: str,
272
270
  ) -> pd.DataFrame:
@@ -277,25 +275,25 @@ def remap_ids(
277
275
 
278
276
  Arguments:
279
277
  data: pd.DataFrame - the data to remap
280
- ticker_map: pd.Series | pd.DataFrame - the mapping of source ids to destination ids
278
+ ticker_map: pd.DataFrame - the mapping of source ids to destination ids
281
279
  src_id_col: str - the name of the source ids column in the data
282
280
  dst_id_col: str - the name of the destination ids column in the ticker map
283
281
  """
284
282
  # first, index the universe and data on the source ids
285
- indexed_map = ticker_map.reset_index().set_index(src_id_col)
283
+ indexed_map = ticker_map.set_index(src_id_col, drop=False)
286
284
  indexed_data = data.set_index(src_id_col)
287
285
  return (
288
286
  # then, join the universe and data
289
287
  indexed_map.join(indexed_data)
290
288
  # get just the destination ids and prediction columns
291
- .reset_index()[[dst_id_col, *indexed_data.columns]]
289
+ .reset_index(drop=True)[[dst_id_col, *indexed_data.columns]]
292
290
  # finally, sort by the destination ticker column
293
291
  .sort_values(dst_id_col)
294
292
  )
295
293
 
296
294
 
297
295
  def clean_submission(
298
- universe: pd.Series | pd.DataFrame,
296
+ universe: pd.DataFrame,
299
297
  submission: pd.DataFrame,
300
298
  src_id_col: str,
301
299
  src_signal_col: str,
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "numerai-tools"
3
- version = "0.5.0.dev9"
3
+ version = "0.5.0.dev11"
4
4
  description = "A collection of open-source tools to help interact with Numerai, model data, and automate submissions."
5
5
  authors = [
6
6
  {name = "Numerai Engineering",email = "engineering@numer.ai"}