numerai-tools 0.5.0.dev2__tar.gz → 0.5.0.dev4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: numerai-tools
3
- Version: 0.5.0.dev2
3
+ Version: 0.5.0.dev4
4
4
  Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
5
5
  License: MIT
6
6
  Author: Numerai Engineering
@@ -26,3 +26,15 @@ Description-Content-Type: text/markdown
26
26
  # numerai-tools
27
27
  A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
28
28
 
29
+ ## Installation
30
+ ```
31
+ pip install numerai-tools
32
+ ```
33
+
34
+ ## Structure
35
+
36
+ - The `scoring.py` module contains critical functions used to score submissions. We use this code in our scoring system. Leverage this to optimize your models for the tournaments.
37
+
38
+ - The `submissions.py` module provides helper functions to ensure your submissions are valid and formatted correctly. Use this in your automated prediction pipelines to ensure uploads don't fail.
39
+
40
+ - The `signals.py` module provides code specific to Numerai Signals such as churn and turnover. Use this to ensure your Signals submissions are properly formatted.
@@ -0,0 +1,15 @@
1
+ # numerai-tools
2
+ A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
3
+
4
+ ## Installation
5
+ ```
6
+ pip install numerai-tools
7
+ ```
8
+
9
+ ## Structure
10
+
11
+ - The `scoring.py` module contains critical functions used to score submissions. We use this code in our scoring system. Leverage this to optimize your models for the tournaments.
12
+
13
+ - The `submissions.py` module provides helper functions to ensure your submissions are valid and formatted correctly. Use this in your automated prediction pipelines to ensure uploads don't fail.
14
+
15
+ - The `signals.py` module provides code specific to Numerai Signals such as churn and turnover. Use this to ensure your Signals submissions are properly formatted.
@@ -1,4 +1,4 @@
1
- from typing import List, Tuple, Union, Optional, TypeVar, cast, Any
1
+ from typing import List, Literal, Tuple, Union, Optional, TypeVar, cast, Any
2
2
 
3
3
  import numpy as np
4
4
  import pandas as pd
@@ -14,6 +14,7 @@ DEFAULT_MAX_FILTERED_INDEX_RATIO = 0.2
14
14
 
15
15
  S1 = TypeVar("S1", bound=Union[pd.DataFrame, pd.Series])
16
16
  S2 = TypeVar("S2", bound=Union[pd.DataFrame, pd.Series])
17
+ RANK_METHOD_TYPE = Literal["average", "min", "max", "first", "dense"]
17
18
 
18
19
 
19
20
  def filter_sort_index(
@@ -109,22 +110,38 @@ def filter_sort_top_bottom_concat(s: pd.Series, top_bottom: int) -> pd.Series:
109
110
  return pd.concat([top, bot]).sort_index()
110
111
 
111
112
 
112
- def rank(df: pd.DataFrame, method: str = "average") -> pd.DataFrame:
113
- """Percentile rank each column of a pandas DataFrame, centering values around 0.5
113
+ def rank_series(s: pd.Series, method: RANK_METHOD_TYPE = "average") -> pd.Series:
114
+ """Percentile rank a pandas Series, centering values around 0.5.
114
115
 
115
116
  Arguments:
116
- df: pd.DataFrame - the data to rank
117
+ s: pd.Series - the data to rank
117
118
  method: str - the pandas ranking method to use, options:
118
119
  'average' (default) - keeps ties
119
120
  'first' - breaks ties by index
120
121
 
121
122
  Returns:
122
- pd.DataFrame - the ranked DataFrame
123
+ pd.Series - the ranked Series
123
124
  """
124
- assert np.array_equal(df.index.sort_values(), df.index), "unsorted index found"
125
- return df.apply(
126
- lambda series: (series.rank(method=method).values - 0.5) / series.count()
127
- )
125
+ assert np.array_equal(s.index.sort_values(), s.index), "unsorted index found"
126
+ return (s.rank(method=method) - 0.5) / s.count()
127
+
128
+
129
+ def rank(s: S1, method: RANK_METHOD_TYPE = "average") -> S1:
130
+ """Percentile rank each columns or series, centering values around 0.5
131
+
132
+ Arguments:
133
+ s: pd.DataFrame | pd.Series - the data to rank
134
+ method: str - the pandas ranking method to use, options:
135
+ 'average' (default) - keeps ties
136
+ 'first' - breaks ties by index
137
+
138
+ Returns:
139
+ pd.DataFrame | pd.Series - the ranked input data
140
+ """
141
+ if isinstance(s, pd.Series):
142
+ return cast(S1, rank_series(s, method))
143
+ else:
144
+ return s.apply(lambda series: rank(series, method=method))
128
145
 
129
146
 
130
147
  def tie_broken_rank(df: pd.DataFrame) -> pd.DataFrame:
@@ -132,9 +149,9 @@ def tie_broken_rank(df: pd.DataFrame) -> pd.DataFrame:
132
149
  return rank(df, "first")
133
150
 
134
151
 
135
- def tie_kept_rank(df: pd.DataFrame) -> pd.DataFrame:
152
+ def tie_kept_rank(s: S1) -> S1:
136
153
  """Rank columns, but keep ties."""
137
- return rank(df, "average")
154
+ return cast(S1, rank(s, "average"))
138
155
 
139
156
 
140
157
  def min_max_normalize(s: pd.Series) -> pd.Series:
@@ -539,14 +556,26 @@ def max_feature_correlation(
539
556
 
540
557
 
541
558
  def generate_neutralized_weights(
542
- predictions: pd.Series,
559
+ predictions: pd.DataFrame,
543
560
  neutralizers: pd.DataFrame,
544
561
  sample_weights: pd.Series,
562
+ center_and_normalize: bool = False,
545
563
  ) -> pd.Series:
546
- neutral_preds = predictions - (
547
- neutralizers @ (neutralizers.T @ (sample_weights * predictions))
564
+ assert not predictions.isna().any().any(), "Predictions contain NaNs"
565
+ assert not neutralizers.isna().any().any(), "Normalization factors contain NaNs"
566
+ assert not sample_weights.isna().any(), "Weights contain NaNs"
567
+ ranked_predictions = tie_kept_rank__gaussianize__pow_1_5(predictions)
568
+ ranked_predictions, neutralizers, sample_weights = filter_sort_index_many(
569
+ [ranked_predictions, neutralizers, sample_weights]
570
+ )
571
+ neutral_weights = ranked_predictions.apply(
572
+ lambda s_prime: (
573
+ s_prime - neutralizers @ (neutralizers.T @ (sample_weights * s_prime))
574
+ )
575
+ * sample_weights
548
576
  )
549
- neutral_weights = neutral_preds * sample_weights
577
+ if center_and_normalize:
578
+ neutral_weights = weight_normalize(center(neutral_weights))
550
579
  return neutral_weights
551
580
 
552
581
 
@@ -568,18 +597,8 @@ def alpha(
568
597
  targets: pd.Series - the live targets to evaluate against
569
598
  """
570
599
  targets = center(targets)
571
- assert not predictions.isna().any().any(), "Predictions contain NaNs"
572
- assert not neutralizers.isna().any().any(), "Normalization factors contain NaNs"
573
- assert not sample_weights.isna().any(), "Weights contain NaNs"
574
- predictions, neutralizers, sample_weights, targets = filter_sort_index_many(
575
- [predictions, neutralizers, sample_weights, targets]
576
- )
577
- ranked_preds = tie_kept_rank__gaussianize__pow_1_5(predictions)
578
- weights = ranked_preds.apply(
579
- lambda s_prime: generate_neutralized_weights(
580
- s_prime, neutralizers, sample_weights
581
- )
582
- )
600
+ predictions, targets = filter_sort_index(predictions, targets)
601
+ weights = generate_neutralized_weights(predictions, neutralizers, sample_weights)
583
602
  alpha_scores = weights.apply(lambda w: w @ targets) / len(targets)
584
603
  return alpha_scores
585
604
 
@@ -605,19 +624,10 @@ def meta_portfolio_contribution(
605
624
  targets: pd.Series - the live targets to evaluate against
606
625
  """
607
626
  targets = center(targets)
608
- assert not predictions.isna().any().any(), "Predictions contain NaNs"
609
- assert not neutralizers.isna().any().any(), "Normalization factors contain NaNs"
610
- assert not sample_weights.isna().any(), "Weights contain NaNs"
611
- predictions, neutralizers, sample_weights, targets = filter_sort_index_many(
612
- [predictions, neutralizers, sample_weights, targets]
613
- )
627
+ predictions, targets = filter_sort_index(predictions, targets)
614
628
  stake_weights = weight_normalize(stakes.fillna(0))
615
629
  assert np.isclose(stake_weights.sum(), 1), "Stakes must sum to 1"
616
- weights = tie_kept_rank__gaussianize__pow_1_5(predictions).apply(
617
- lambda s_prime: generate_neutralized_weights(
618
- s_prime, neutralizers, sample_weights
619
- )
620
- )
630
+ weights = generate_neutralized_weights(predictions, neutralizers, sample_weights)
621
631
  w = cast(np.ndarray, weights[stakes.index].values)
622
632
  s = cast(np.ndarray, stake_weights.values)
623
633
  t = cast(np.ndarray, targets.values)
@@ -1,16 +1,14 @@
1
1
  from typing import Tuple, Optional
2
2
 
3
- from numerai_tools.submissions import validate_headers_signals, validate_ids_signals
4
3
  from numerai_tools.scoring import (
5
4
  filter_sort_index,
6
5
  filter_sort_top_bottom,
7
6
  spearman_correlation,
8
- tie_kept_rank,
9
- tie_kept_rank__gaussianize__pow_1_5,
10
- filter_sort_index_many,
11
7
  generate_neutralized_weights,
12
- weight_normalize,
13
- center,
8
+ )
9
+ from numerai_tools.submissions import (
10
+ validate_and_clean_submission_signals,
11
+ remap_ids,
14
12
  )
15
13
 
16
14
  import pandas as pd
@@ -79,48 +77,6 @@ def turnover(
79
77
  return turnover
80
78
 
81
79
 
82
- def neutral_weight(
83
- submission: pd.Series,
84
- signal_col: str,
85
- neutralizer: pd.DataFrame,
86
- weight: pd.Series,
87
- ) -> pd.Series:
88
- s_prime = tie_kept_rank__gaussianize__pow_1_5(submission.to_frame())
89
- s_prime, neutralizer, weight = filter_sort_index_many(
90
- [s_prime, neutralizer, weight]
91
- )
92
- neutral_weights = generate_neutralized_weights(
93
- s_prime[signal_col], neutralizer, weight
94
- )
95
- neutral_weights = weight_normalize(center(neutral_weights.to_frame()))[0]
96
- return neutral_weights.sort_index()
97
-
98
-
99
- def remap_ticker_col(
100
- predictions: pd.DataFrame,
101
- universe: pd.DataFrame,
102
- ticker_col: str,
103
- ) -> pd.DataFrame:
104
- return (
105
- predictions.join(universe, how="right")
106
- .reset_index()
107
- .set_index(ticker_col)
108
- .sort_index()
109
- )
110
-
111
-
112
- def rank_and_fill_signal(
113
- universe: pd.DataFrame,
114
- submission: pd.Series,
115
- signal_col: str,
116
- ) -> pd.Series:
117
- uni_joined_sub = universe.sort_index().join(
118
- tie_kept_rank(submission.sort_index().to_frame())
119
- )[[signal_col]]
120
- filled_sub = uni_joined_sub.fillna(uni_joined_sub.median()).sort_index()
121
- return filled_sub[signal_col]
122
-
123
-
124
80
  def calculate_max_churn_and_turnover(
125
81
  curr_sub: pd.DataFrame,
126
82
  curr_neutralizer: pd.DataFrame,
@@ -141,7 +97,7 @@ def calculate_max_churn_and_turnover(
141
97
  prev_week_subs -- a dictionary of datestamps to submissions
142
98
  prev_neutralizers -- a dictionary of datestamps to neutralizers
143
99
  prev_sample_weights -- a dictionary of datestamps to sample weights
144
- universe -- the internal universe DataFrame
100
+ universe -- the universe DataFrame for the current era
145
101
  curr_signal_col -- the column name for signal in the current submission
146
102
  curr_ticker_col -- the column name for tickers in the current submission
147
103
 
@@ -149,49 +105,48 @@ def calculate_max_churn_and_turnover(
149
105
  prev_week_max_churn -- the maximum churn from previous submissions
150
106
  prev_week_max_turnover -- the maximum turnover from previous submissions
151
107
  """
152
- curr_sub_vector: pd.Series = rank_and_fill_signal(
153
- universe,
154
- curr_sub.reset_index().set_index(curr_ticker_col).sort_index()[curr_signal_col],
155
- curr_signal_col,
108
+ universe = universe.reset_index()
109
+ curr_sub_vector = validate_and_clean_submission_signals(
110
+ universe=universe,
111
+ submission=curr_sub,
112
+ id_col=curr_ticker_col,
113
+ rename_as=curr_signal_col,
114
+ rank_and_fill=True,
156
115
  )
157
116
  churn_stats = []
158
117
  turnover_stats = []
159
- neutralized_weights = neutral_weight(
160
- curr_sub_vector, curr_signal_col, curr_neutralizer, curr_weight
118
+ neutralized_weights = generate_neutralized_weights(
119
+ curr_sub_vector.to_frame(), curr_neutralizer, curr_weight
161
120
  )
162
121
  for datestamp in prev_week_subs:
163
122
  prev_sub = prev_week_subs[datestamp]
164
123
  prev_neutralizer = prev_neutralizers[datestamp]
165
124
  prev_weight = prev_sample_weights[datestamp]
166
- prev_ticker_col, prev_signal_col = validate_headers_signals(prev_sub)
167
- prev_universe = universe.reset_index().set_index(prev_ticker_col)
168
- filtered_prev_sub_df, _ = validate_ids_signals(
169
- prev_universe.index.to_series(), prev_sub, prev_ticker_col
170
- )
171
- # in case the previous submission has a different ticker column,
172
- # remap the ticker column of prev data to the current ticker column
173
- filtered_prev_sub = remap_ticker_col(
174
- filtered_prev_sub_df.set_index(prev_ticker_col),
175
- universe=prev_universe,
176
- ticker_col=curr_ticker_col,
177
- )[curr_signal_col]
178
- filtered_prev_sub = rank_and_fill_signal(
125
+ filtered_prev_sub = validate_and_clean_submission_signals(
179
126
  universe=universe,
180
- submission=filtered_prev_sub,
181
- signal_col=curr_signal_col,
127
+ submission=prev_sub,
128
+ id_col=curr_ticker_col,
129
+ rename_as=curr_signal_col,
130
+ rank_and_fill=True,
131
+ )
132
+ prev_neutralizer = (
133
+ remap_ids(
134
+ prev_neutralizer.reset_index(),
135
+ universe,
136
+ str(prev_neutralizer.index.name),
137
+ curr_ticker_col,
138
+ )
139
+ .set_index(curr_ticker_col)
140
+ .filter(like="neutralizer_")
182
141
  )
183
- prev_neutralizer = remap_ticker_col(
184
- prev_neutralizer,
185
- universe=prev_universe,
186
- ticker_col=curr_ticker_col,
187
- ).filter(like="neutralizer_")
188
- prev_weight = remap_ticker_col(
189
- prev_weight.to_frame(),
190
- universe=prev_universe,
191
- ticker_col=curr_ticker_col,
192
- )[prev_weight.name]
193
- prev_neutralized_weights = neutral_weight(
194
- filtered_prev_sub, prev_signal_col, prev_neutralizer, prev_weight
142
+ prev_weight = remap_ids(
143
+ prev_weight.reset_index(),
144
+ universe,
145
+ str(prev_weight.index.name),
146
+ curr_ticker_col,
147
+ ).set_index(curr_ticker_col)[prev_weight.name]
148
+ prev_neutralized_weights = generate_neutralized_weights(
149
+ filtered_prev_sub.to_frame(), prev_neutralizer, prev_weight
195
150
  )
196
151
  try:
197
152
  churn_val = abs(churn(curr_sub_vector, filtered_prev_sub))
@@ -1,7 +1,7 @@
1
1
  from numerai_tools.scoring import tie_kept_rank
2
2
 
3
3
  import logging
4
- from typing import Tuple, List
4
+ from typing import Tuple, List, Optional
5
5
 
6
6
  import pandas as pd
7
7
  import numpy as np
@@ -49,9 +49,10 @@ def _validate_headers(
49
49
  ]
50
50
  columns = submission.columns
51
51
  valid_headers = list(columns) in expected_headers
52
- assert (
53
- valid_headers
54
- ), f"headers must be one of {expected_id_cols} and one of {expected_pred_cols}"
52
+ assert valid_headers, (
53
+ "invalid_submission_headers: headers must be one of"
54
+ f" {expected_id_cols} and one of {expected_pred_cols}"
55
+ )
55
56
  return columns[0], columns[1]
56
57
 
57
58
 
@@ -62,17 +63,6 @@ def validate_headers_numerai(submission: pd.DataFrame) -> Tuple[str, str]:
62
63
 
63
64
 
64
65
  def validate_headers_signals(submission: pd.DataFrame) -> Tuple[str, str]:
65
- if "data_type" in submission.columns:
66
- logger.warning(
67
- "data_type column found in Signals submission. This is deprecated and will be removed in the future. "
68
- "Please remove the data_type column from your Signals submission."
69
- )
70
- date_col = [
71
- date_col
72
- for date_col in SIGNALS_ALLOWED_DATE_COLS
73
- if date_col in list(submission.columns)
74
- ]
75
- submission = submission.drop(columns=["data_type", *date_col], errors="ignore")
76
66
  return _validate_headers(
77
67
  SIGNALS_ALLOWED_ID_COLS, SIGNALS_ALLOWED_PRED_COLS, submission
78
68
  )
@@ -95,13 +85,13 @@ def validate_values(submission: pd.DataFrame, prediction_col: str) -> None:
95
85
  """
96
86
  assert (
97
87
  submission[prediction_col].isna().sum() == 0
98
- ), "submission must not contain NaNs"
88
+ ), "invalid_submission_values: submission must not contain NaNs"
99
89
  assert (
100
90
  submission[prediction_col].between(0, 1).all()
101
- ), "values must be between 0 and 1 exclusive"
91
+ ), "invalid_submission_values: values must be between 0 and 1 exclusive"
102
92
  assert not np.isclose(
103
93
  0, submission[prediction_col].std()
104
- ), "submission must have non-zero standard deviation"
94
+ ), "invalid_submission_values: submission must have non-zero standard deviation"
105
95
 
106
96
 
107
97
  def _validate_ids(
@@ -127,7 +117,7 @@ def _validate_ids(
127
117
  """
128
118
  assert (
129
119
  not submission[id_col].isna().any()
130
- ), f"Submission must not contain NaNs in the {id_col} column."
120
+ ), f"invalid_submission_ids: Submission must not contain NaNs in the {id_col} column."
131
121
 
132
122
  index_sub = submission.copy()
133
123
  index_sub[id_col] = index_sub[id_col].astype(str)
@@ -136,12 +126,13 @@ def _validate_ids(
136
126
  live_sub = index_sub[index_sub[id_col].isin(live_ids)].sort_values(id_col)
137
127
  assert (
138
128
  not live_sub[id_col].duplicated().any()
139
- ), f"Duplicates detected in {id_col} for live period."
129
+ ), f"invalid_submission_ids: Duplicates detected in {id_col} for live period."
140
130
 
141
131
  # join on live_ids and ensure min tickers reached
142
- assert (
143
- len(live_sub) >= min_tickers
144
- ), f"Not enough stocks submitted. Are you using the latest live ids or live universe?"
132
+ assert len(live_sub) >= min_tickers, (
133
+ "invalid_submission_ids: Not enough stocks submitted."
134
+ " Are you using the latest live ids or live universe?"
135
+ )
145
136
 
146
137
  invalid_tickers = list(set(index_sub[id_col]).difference(set(live_sub[id_col])))
147
138
  return live_sub, invalid_tickers
@@ -165,12 +156,34 @@ def validate_ids_crypto(
165
156
  return _validate_ids(live_ids, submission, id_col, CRYPTO_MIN_TICKERS)
166
157
 
167
158
 
168
- def clean_predictions(
169
- live_ids: pd.Series,
159
+ def remap_ids(
160
+ data: pd.DataFrame,
161
+ ticker_map: pd.Series | pd.DataFrame,
162
+ src_id_col: str,
163
+ dst_id_col: str,
164
+ ) -> pd.DataFrame:
165
+ # first, index the universe and data on the source ids
166
+ indexed_map = ticker_map.reset_index().set_index(src_id_col)
167
+ indexed_data = data.set_index(src_id_col)
168
+ return (
169
+ # then, join the universe and data
170
+ indexed_map.join(indexed_data)
171
+ # get just the destination ids and prediction columns
172
+ .reset_index()[[dst_id_col, *indexed_data.columns]]
173
+ # finally, sort by the destination ticker column
174
+ .sort_values(dst_id_col)
175
+ )
176
+
177
+
178
+ def clean_submission(
179
+ live_ids: pd.Series | pd.DataFrame,
170
180
  predictions: pd.DataFrame,
181
+ ticker_col: str,
182
+ signal_col: str,
183
+ rename_as: Optional[str],
171
184
  id_col: str,
172
185
  rank_and_fill: bool,
173
- ) -> pd.DataFrame:
186
+ ) -> pd.Series:
174
187
  """Prepare predictions for submission to Numerai.
175
188
  Filters out ids not in live data, drops duplicates, sets ids as index,
176
189
  then optionally ranks (keeping ties) and fills NaNs with 0.5.
@@ -182,28 +195,115 @@ def clean_predictions(
182
195
  Arguments:
183
196
  live_ids: pd.Series - the ids in the live data
184
197
  predictions: pd.DataFrame - the predictions to clean
198
+ ticker_col: str - the name of the ids column
199
+ signal_col: str - the name of the predictions column
200
+ rename_as: Optional[str] - the string to which the submission should be renamed
185
201
  id_col: str - the column name of the ids
186
202
  rank_and_fill: bool - whether to rank and fill NaNs with 0.5
187
- left_join_ids: bool - whether to left join the predictions onto the ids
203
+
204
+ Returns:
205
+ pd.Series - the cleaned prediction series with ids as index
188
206
  """
189
207
  assert len(live_ids) > 0, "live_ids must not be empty"
190
- assert live_ids.isna().sum() == 0, "live_ids must not contain NaNs"
208
+ if isinstance(live_ids, pd.DataFrame):
209
+ assert live_ids.isna().sum().sum() == 0, "live_ids must not contain NaNs"
210
+ else:
211
+ assert live_ids.isna().sum() == 0, "live_ids must not contain NaNs"
191
212
  assert len(predictions) > 0, "predictions must not be empty"
192
213
 
193
- # drop null indices
194
- predictions = predictions[~predictions[id_col].isna()]
195
- predictions = (
196
- predictions[
197
- # filter out ids not in live data
198
- predictions[id_col].isin(live_ids)
199
- ]
200
- # drop duplicate ids (keep first)
214
+ clean_preds = (
215
+ remap_ids(predictions, live_ids, ticker_col, id_col)
216
+ # drop NaNs and duplicates
217
+ .dropna(subset=[id_col])
201
218
  .drop_duplicates(subset=id_col, keep="first")
202
- # set ids as index
219
+ # set ids as index and sort
203
220
  .set_index(id_col)
204
221
  .sort_index()
205
- )
222
+ # rename to given name
223
+ .rename(columns={signal_col: rename_as})
224
+ )[rename_as]
206
225
  # rank and fill with 0.5
207
226
  if rank_and_fill:
208
- predictions = tie_kept_rank(predictions).fillna(0.5)
209
- return predictions
227
+ clean_preds = tie_kept_rank(clean_preds).fillna(0.5)
228
+ return clean_preds
229
+
230
+
231
+ def validate_and_clean_submission_numerai(
232
+ universe: pd.Series,
233
+ submission: pd.DataFrame,
234
+ id_col: str = "id",
235
+ rename_as: Optional[str] = None,
236
+ rank_and_fill: bool = False,
237
+ ) -> pd.Series:
238
+ ticker_col, signal_col = validate_headers_numerai(submission)
239
+ filtered_sub, invalid_tickers = validate_ids_numerai(
240
+ universe, submission, ticker_col
241
+ )
242
+ validate_values(filtered_sub, signal_col)
243
+ return clean_submission(
244
+ live_ids=universe,
245
+ predictions=filtered_sub,
246
+ ticker_col=ticker_col,
247
+ signal_col=signal_col,
248
+ rename_as=rename_as,
249
+ id_col=id_col,
250
+ rank_and_fill=rank_and_fill,
251
+ )
252
+
253
+
254
+ def validate_and_clean_submission_signals(
255
+ universe: pd.DataFrame,
256
+ submission: pd.DataFrame,
257
+ id_col: str,
258
+ rename_as: Optional[str] = None,
259
+ rank_and_fill: bool = True,
260
+ ) -> pd.Series:
261
+ # drop data_type and date columns if they exist
262
+ if "data_type" in submission.columns:
263
+ logger.warning(
264
+ "data_type column found in Signals submission. This is deprecated and support will be removed in the future. "
265
+ "Please remove the data_type column from your Signals submission."
266
+ )
267
+ date_col = [
268
+ date_col
269
+ for date_col in SIGNALS_ALLOWED_DATE_COLS
270
+ if date_col in list(submission.columns)
271
+ ]
272
+ submission = submission.drop(columns=["data_type", *date_col], errors="ignore")
273
+ ticker_col, signal_col = validate_headers_signals(submission)
274
+ filtered_sub, invalid_tickers = validate_ids_signals(
275
+ universe[ticker_col], submission, ticker_col
276
+ )
277
+ validate_values(filtered_sub, signal_col)
278
+ return clean_submission(
279
+ live_ids=universe,
280
+ predictions=filtered_sub,
281
+ ticker_col=ticker_col,
282
+ signal_col=signal_col,
283
+ rename_as=rename_as,
284
+ id_col=id_col,
285
+ rank_and_fill=rank_and_fill,
286
+ )
287
+
288
+
289
+ def validate_and_clean_submission_crypto(
290
+ universe: pd.DataFrame,
291
+ submission: pd.DataFrame,
292
+ id_col: str = "symbol",
293
+ rename_as: Optional[str] = None,
294
+ rank_and_fill: bool = True,
295
+ ):
296
+ ticker_col, signal_col = validate_headers_crypto(submission)
297
+ filtered_sub, invalid_tickers = validate_ids_crypto(
298
+ universe[ticker_col], submission, ticker_col
299
+ )
300
+ validate_values(filtered_sub, signal_col)
301
+ return clean_submission(
302
+ live_ids=universe,
303
+ predictions=filtered_sub,
304
+ ticker_col=ticker_col,
305
+ signal_col=signal_col,
306
+ rename_as=rename_as,
307
+ id_col=id_col,
308
+ rank_and_fill=rank_and_fill,
309
+ )
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "numerai-tools"
3
- version = "0.5.0.dev2"
3
+ version = "0.5.0.dev4"
4
4
  description = "A collection of open-source tools to help interact with Numerai, model data, and automate submissions."
5
5
  authors = [
6
6
  {name = "Numerai Engineering",email = "engineering@numer.ai"}
@@ -28,6 +28,11 @@ documentation = "https://docs.numer.ai/"
28
28
  packages = [
29
29
  {include = "numerai_tools", from = "."},
30
30
  ]
31
+ include = [
32
+ { path = "LICENSE", format = ["sdist", "wheel"] },
33
+ { path = "README.md", format = ["sdist", "wheel"] },
34
+ { path = "numerai_tools/py.typed", format = ["sdist", "wheel"] }
35
+ ]
31
36
 
32
37
  [tool.poetry.dependencies]
33
38
  pandas = "^2.2.2"
@@ -39,6 +44,8 @@ scikit-learn = "^1.5.0"
39
44
  pytest = "^8.3.4"
40
45
  mypy = "^1.15.0"
41
46
  ruff = "^0.5.4"
47
+ pandas-stubs = "^2.3.0.250703"
48
+ scipy-stubs = "^1.16.1.0"
42
49
 
43
50
  [build-system]
44
51
  requires = ["poetry-core>=2.0.0,<3.0.0"]
@@ -1,2 +0,0 @@
1
- # numerai-tools
2
- A collection of open-source tools to help interact with Numerai, model data, and automate submissions.