numerai-tools 0.5.0.dev1__tar.gz → 0.5.0.dev3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numerai_tools-0.5.0.dev3/PKG-INFO +40 -0
- numerai_tools-0.5.0.dev3/README.md +15 -0
- {numerai_tools-0.5.0.dev1 → numerai_tools-0.5.0.dev3}/numerai_tools/scoring.py +48 -38
- {numerai_tools-0.5.0.dev1 → numerai_tools-0.5.0.dev3}/numerai_tools/signals.py +37 -82
- {numerai_tools-0.5.0.dev1 → numerai_tools-0.5.0.dev3}/numerai_tools/submissions.py +109 -28
- numerai_tools-0.5.0.dev3/pyproject.toml +52 -0
- numerai_tools-0.5.0.dev1/PKG-INFO +0 -22
- numerai_tools-0.5.0.dev1/README.md +0 -2
- numerai_tools-0.5.0.dev1/numerai_tools.egg-info/PKG-INFO +0 -22
- numerai_tools-0.5.0.dev1/numerai_tools.egg-info/SOURCES.txt +0 -16
- numerai_tools-0.5.0.dev1/numerai_tools.egg-info/dependency_links.txt +0 -1
- numerai_tools-0.5.0.dev1/numerai_tools.egg-info/requires.txt +0 -4
- numerai_tools-0.5.0.dev1/numerai_tools.egg-info/top_level.txt +0 -1
- numerai_tools-0.5.0.dev1/setup.cfg +0 -4
- numerai_tools-0.5.0.dev1/setup.py +0 -47
- numerai_tools-0.5.0.dev1/tests/test_scoring.py +0 -346
- numerai_tools-0.5.0.dev1/tests/test_signals.py +0 -139
- numerai_tools-0.5.0.dev1/tests/test_submissions.py +0 -498
- {numerai_tools-0.5.0.dev1 → numerai_tools-0.5.0.dev3}/LICENSE +0 -0
- {numerai_tools-0.5.0.dev1 → numerai_tools-0.5.0.dev3}/numerai_tools/__init__.py +0 -0
- {numerai_tools-0.5.0.dev1 → numerai_tools-0.5.0.dev3}/numerai_tools/py.typed +0 -0
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: numerai-tools
|
|
3
|
+
Version: 0.5.0.dev3
|
|
4
|
+
Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: Numerai Engineering
|
|
7
|
+
Author-email: engineering@numer.ai
|
|
8
|
+
Requires-Python: >=3.11
|
|
9
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
10
|
+
Classifier: Environment :: Console
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering
|
|
17
|
+
Requires-Dist: numpy (>=2.0.0,<3.0.0)
|
|
18
|
+
Requires-Dist: pandas (>=2.2.2,<3.0.0)
|
|
19
|
+
Requires-Dist: scikit-learn (>=1.5.0,<2.0.0)
|
|
20
|
+
Requires-Dist: scipy (>=1.13.0,<2.0.0)
|
|
21
|
+
Project-URL: Documentation, https://docs.numer.ai/
|
|
22
|
+
Project-URL: Homepage, https://numer.ai
|
|
23
|
+
Project-URL: Repository, https://github.com/numerai/numerai-tools
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# numerai-tools
|
|
27
|
+
A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
```
|
|
31
|
+
pip install numerai-tools
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Structure
|
|
35
|
+
|
|
36
|
+
- The `scoring.py` module contains critical functions used to score submissions. We use this code in our scoring system. Leverage this to optimize your models for the tournaments.
|
|
37
|
+
|
|
38
|
+
- The `submissions.py` module provides helper functions to ensure your submissions are valid and formatted correctly. Use this in your automated prediction pipelines to ensure uploads don't fail.
|
|
39
|
+
|
|
40
|
+
- The `signals.py` module provides code specific to Numerai Signals such as churn and turnover. Use this to ensure your Signals submissions are properly formatted.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# numerai-tools
|
|
2
|
+
A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
|
|
3
|
+
|
|
4
|
+
## Installation
|
|
5
|
+
```
|
|
6
|
+
pip install numerai-tools
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
## Structure
|
|
10
|
+
|
|
11
|
+
- The `scoring.py` module contains critical functions used to score submissions. We use this code in our scoring system. Leverage this to optimize your models for the tournaments.
|
|
12
|
+
|
|
13
|
+
- The `submissions.py` module provides helper functions to ensure your submissions are valid and formatted correctly. Use this in your automated prediction pipelines to ensure uploads don't fail.
|
|
14
|
+
|
|
15
|
+
- The `signals.py` module provides code specific to Numerai Signals such as churn and turnover. Use this to ensure your Signals submissions are properly formatted.
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import List, Tuple, Union, Optional, TypeVar, cast, Any
|
|
1
|
+
from typing import List, Literal, Tuple, Union, Optional, TypeVar, cast, Any
|
|
2
2
|
|
|
3
3
|
import numpy as np
|
|
4
4
|
import pandas as pd
|
|
@@ -14,6 +14,7 @@ DEFAULT_MAX_FILTERED_INDEX_RATIO = 0.2
|
|
|
14
14
|
|
|
15
15
|
S1 = TypeVar("S1", bound=Union[pd.DataFrame, pd.Series])
|
|
16
16
|
S2 = TypeVar("S2", bound=Union[pd.DataFrame, pd.Series])
|
|
17
|
+
RANK_METHOD_TYPE = Literal["average", "min", "max", "first", "dense"]
|
|
17
18
|
|
|
18
19
|
|
|
19
20
|
def filter_sort_index(
|
|
@@ -109,22 +110,38 @@ def filter_sort_top_bottom_concat(s: pd.Series, top_bottom: int) -> pd.Series:
|
|
|
109
110
|
return pd.concat([top, bot]).sort_index()
|
|
110
111
|
|
|
111
112
|
|
|
112
|
-
def
|
|
113
|
-
"""Percentile rank
|
|
113
|
+
def rank_series(s: pd.Series, method: RANK_METHOD_TYPE = "average") -> pd.Series:
|
|
114
|
+
"""Percentile rank a pandas Series, centering values around 0.5.
|
|
114
115
|
|
|
115
116
|
Arguments:
|
|
116
|
-
|
|
117
|
+
s: pd.Series - the data to rank
|
|
117
118
|
method: str - the pandas ranking method to use, options:
|
|
118
119
|
'average' (default) - keeps ties
|
|
119
120
|
'first' - breaks ties by index
|
|
120
121
|
|
|
121
122
|
Returns:
|
|
122
|
-
pd.
|
|
123
|
+
pd.Series - the ranked Series
|
|
123
124
|
"""
|
|
124
|
-
assert np.array_equal(
|
|
125
|
-
return
|
|
126
|
-
|
|
127
|
-
|
|
125
|
+
assert np.array_equal(s.index.sort_values(), s.index), "unsorted index found"
|
|
126
|
+
return (s.rank(method=method) - 0.5) / s.count()
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def rank(s: S1, method: RANK_METHOD_TYPE = "average") -> S1:
|
|
130
|
+
"""Percentile rank each columns or series, centering values around 0.5
|
|
131
|
+
|
|
132
|
+
Arguments:
|
|
133
|
+
s: pd.DataFrame | pd.Series - the data to rank
|
|
134
|
+
method: str - the pandas ranking method to use, options:
|
|
135
|
+
'average' (default) - keeps ties
|
|
136
|
+
'first' - breaks ties by index
|
|
137
|
+
|
|
138
|
+
Returns:
|
|
139
|
+
pd.DataFrame | pd.Series - the ranked input data
|
|
140
|
+
"""
|
|
141
|
+
if isinstance(s, pd.Series):
|
|
142
|
+
return cast(S1, rank_series(s, method))
|
|
143
|
+
else:
|
|
144
|
+
return s.apply(lambda series: rank(series, method=method))
|
|
128
145
|
|
|
129
146
|
|
|
130
147
|
def tie_broken_rank(df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -132,9 +149,9 @@ def tie_broken_rank(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
132
149
|
return rank(df, "first")
|
|
133
150
|
|
|
134
151
|
|
|
135
|
-
def tie_kept_rank(
|
|
152
|
+
def tie_kept_rank(s: S1) -> S1:
|
|
136
153
|
"""Rank columns, but keep ties."""
|
|
137
|
-
return rank(
|
|
154
|
+
return cast(S1, rank(s, "average"))
|
|
138
155
|
|
|
139
156
|
|
|
140
157
|
def min_max_normalize(s: pd.Series) -> pd.Series:
|
|
@@ -539,14 +556,26 @@ def max_feature_correlation(
|
|
|
539
556
|
|
|
540
557
|
|
|
541
558
|
def generate_neutralized_weights(
|
|
542
|
-
predictions: pd.
|
|
559
|
+
predictions: pd.DataFrame,
|
|
543
560
|
neutralizers: pd.DataFrame,
|
|
544
561
|
sample_weights: pd.Series,
|
|
562
|
+
center_and_normalize: bool = False,
|
|
545
563
|
) -> pd.Series:
|
|
546
|
-
|
|
547
|
-
|
|
564
|
+
assert not predictions.isna().any().any(), "Predictions contain NaNs"
|
|
565
|
+
assert not neutralizers.isna().any().any(), "Normalization factors contain NaNs"
|
|
566
|
+
assert not sample_weights.isna().any(), "Weights contain NaNs"
|
|
567
|
+
ranked_predictions = tie_kept_rank__gaussianize__pow_1_5(predictions)
|
|
568
|
+
ranked_predictions, neutralizers, sample_weights = filter_sort_index_many(
|
|
569
|
+
[ranked_predictions, neutralizers, sample_weights]
|
|
570
|
+
)
|
|
571
|
+
neutral_weights = ranked_predictions.apply(
|
|
572
|
+
lambda s_prime: (
|
|
573
|
+
s_prime - neutralizers @ (neutralizers.T @ (sample_weights * s_prime))
|
|
574
|
+
)
|
|
575
|
+
* sample_weights
|
|
548
576
|
)
|
|
549
|
-
|
|
577
|
+
if center_and_normalize:
|
|
578
|
+
neutral_weights = weight_normalize(center(neutral_weights))
|
|
550
579
|
return neutral_weights
|
|
551
580
|
|
|
552
581
|
|
|
@@ -568,18 +597,8 @@ def alpha(
|
|
|
568
597
|
targets: pd.Series - the live targets to evaluate against
|
|
569
598
|
"""
|
|
570
599
|
targets = center(targets)
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
assert not sample_weights.isna().any(), "Weights contain NaNs"
|
|
574
|
-
predictions, neutralizers, sample_weights, targets = filter_sort_index_many(
|
|
575
|
-
[predictions, neutralizers, sample_weights, targets]
|
|
576
|
-
)
|
|
577
|
-
ranked_preds = tie_kept_rank__gaussianize__pow_1_5(predictions)
|
|
578
|
-
weights = ranked_preds.apply(
|
|
579
|
-
lambda s_prime: generate_neutralized_weights(
|
|
580
|
-
s_prime, neutralizers, sample_weights
|
|
581
|
-
)
|
|
582
|
-
)
|
|
600
|
+
predictions, targets = filter_sort_index(predictions, targets)
|
|
601
|
+
weights = generate_neutralized_weights(predictions, neutralizers, sample_weights)
|
|
583
602
|
alpha_scores = weights.apply(lambda w: w @ targets) / len(targets)
|
|
584
603
|
return alpha_scores
|
|
585
604
|
|
|
@@ -605,19 +624,10 @@ def meta_portfolio_contribution(
|
|
|
605
624
|
targets: pd.Series - the live targets to evaluate against
|
|
606
625
|
"""
|
|
607
626
|
targets = center(targets)
|
|
608
|
-
|
|
609
|
-
assert not neutralizers.isna().any().any(), "Normalization factors contain NaNs"
|
|
610
|
-
assert not sample_weights.isna().any(), "Weights contain NaNs"
|
|
611
|
-
predictions, neutralizers, sample_weights, targets = filter_sort_index_many(
|
|
612
|
-
[predictions, neutralizers, sample_weights, targets]
|
|
613
|
-
)
|
|
627
|
+
predictions, targets = filter_sort_index(predictions, targets)
|
|
614
628
|
stake_weights = weight_normalize(stakes.fillna(0))
|
|
615
629
|
assert np.isclose(stake_weights.sum(), 1), "Stakes must sum to 1"
|
|
616
|
-
weights =
|
|
617
|
-
lambda s_prime: generate_neutralized_weights(
|
|
618
|
-
s_prime, neutralizers, sample_weights
|
|
619
|
-
)
|
|
620
|
-
)
|
|
630
|
+
weights = generate_neutralized_weights(predictions, neutralizers, sample_weights)
|
|
621
631
|
w = cast(np.ndarray, weights[stakes.index].values)
|
|
622
632
|
s = cast(np.ndarray, stake_weights.values)
|
|
623
633
|
t = cast(np.ndarray, targets.values)
|
|
@@ -1,16 +1,14 @@
|
|
|
1
1
|
from typing import Tuple, Optional
|
|
2
2
|
|
|
3
|
-
from numerai_tools.submissions import validate_headers_signals, validate_ids_signals
|
|
4
3
|
from numerai_tools.scoring import (
|
|
5
4
|
filter_sort_index,
|
|
6
5
|
filter_sort_top_bottom,
|
|
7
6
|
spearman_correlation,
|
|
8
|
-
tie_kept_rank,
|
|
9
|
-
tie_kept_rank__gaussianize__pow_1_5,
|
|
10
|
-
filter_sort_index_many,
|
|
11
7
|
generate_neutralized_weights,
|
|
12
|
-
|
|
13
|
-
|
|
8
|
+
)
|
|
9
|
+
from numerai_tools.submissions import (
|
|
10
|
+
clean_submission_signals,
|
|
11
|
+
remap_ids,
|
|
14
12
|
)
|
|
15
13
|
|
|
16
14
|
import pandas as pd
|
|
@@ -79,48 +77,6 @@ def turnover(
|
|
|
79
77
|
return turnover
|
|
80
78
|
|
|
81
79
|
|
|
82
|
-
def neutral_weight(
|
|
83
|
-
submission: pd.Series,
|
|
84
|
-
signal_col: str,
|
|
85
|
-
neutralizer: pd.DataFrame,
|
|
86
|
-
weight: pd.Series,
|
|
87
|
-
) -> pd.Series:
|
|
88
|
-
s_prime = tie_kept_rank__gaussianize__pow_1_5(submission.to_frame())
|
|
89
|
-
s_prime, neutralizer, weight = filter_sort_index_many(
|
|
90
|
-
[s_prime, neutralizer, weight]
|
|
91
|
-
)
|
|
92
|
-
neutral_weights = generate_neutralized_weights(
|
|
93
|
-
s_prime[signal_col], neutralizer, weight
|
|
94
|
-
)
|
|
95
|
-
neutral_weights = weight_normalize(center(neutral_weights.to_frame()))[0]
|
|
96
|
-
return neutral_weights.sort_index()
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
def remap_ticker_col(
|
|
100
|
-
predictions: pd.DataFrame,
|
|
101
|
-
universe: pd.DataFrame,
|
|
102
|
-
ticker_col: str,
|
|
103
|
-
) -> pd.DataFrame:
|
|
104
|
-
return (
|
|
105
|
-
predictions.join(universe, how="right")
|
|
106
|
-
.reset_index()
|
|
107
|
-
.set_index(ticker_col)
|
|
108
|
-
.sort_index()
|
|
109
|
-
)
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
def rank_and_fill_signal(
|
|
113
|
-
universe: pd.DataFrame,
|
|
114
|
-
submission: pd.Series,
|
|
115
|
-
signal_col: str,
|
|
116
|
-
) -> pd.Series:
|
|
117
|
-
uni_joined_sub = universe.sort_index().join(
|
|
118
|
-
tie_kept_rank(submission.sort_index().to_frame())
|
|
119
|
-
)[[signal_col]]
|
|
120
|
-
filled_sub = uni_joined_sub.fillna(uni_joined_sub.median()).sort_index()
|
|
121
|
-
return filled_sub[signal_col]
|
|
122
|
-
|
|
123
|
-
|
|
124
80
|
def calculate_max_churn_and_turnover(
|
|
125
81
|
curr_sub: pd.DataFrame,
|
|
126
82
|
curr_neutralizer: pd.DataFrame,
|
|
@@ -141,7 +97,7 @@ def calculate_max_churn_and_turnover(
|
|
|
141
97
|
prev_week_subs -- a dictionary of datestamps to submissions
|
|
142
98
|
prev_neutralizers -- a dictionary of datestamps to neutralizers
|
|
143
99
|
prev_sample_weights -- a dictionary of datestamps to sample weights
|
|
144
|
-
universe -- the
|
|
100
|
+
universe -- the universe DataFrame for the current era
|
|
145
101
|
curr_signal_col -- the column name for signal in the current submission
|
|
146
102
|
curr_ticker_col -- the column name for tickers in the current submission
|
|
147
103
|
|
|
@@ -149,49 +105,48 @@ def calculate_max_churn_and_turnover(
|
|
|
149
105
|
prev_week_max_churn -- the maximum churn from previous submissions
|
|
150
106
|
prev_week_max_turnover -- the maximum turnover from previous submissions
|
|
151
107
|
"""
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
108
|
+
universe = universe.reset_index()
|
|
109
|
+
curr_sub_vector = clean_submission_signals(
|
|
110
|
+
universe=universe,
|
|
111
|
+
submission=curr_sub,
|
|
112
|
+
submission_id=curr_signal_col,
|
|
113
|
+
index_col=curr_ticker_col,
|
|
114
|
+
rank_and_fill=True,
|
|
156
115
|
)
|
|
157
116
|
churn_stats = []
|
|
158
117
|
turnover_stats = []
|
|
159
|
-
neutralized_weights =
|
|
160
|
-
curr_sub_vector,
|
|
118
|
+
neutralized_weights = generate_neutralized_weights(
|
|
119
|
+
curr_sub_vector.to_frame(), curr_neutralizer, curr_weight
|
|
161
120
|
)
|
|
162
121
|
for datestamp in prev_week_subs:
|
|
163
122
|
prev_sub = prev_week_subs[datestamp]
|
|
164
123
|
prev_neutralizer = prev_neutralizers[datestamp]
|
|
165
124
|
prev_weight = prev_sample_weights[datestamp]
|
|
166
|
-
|
|
167
|
-
prev_universe = universe.reset_index().set_index(prev_ticker_col)
|
|
168
|
-
filtered_prev_sub_df, _ = validate_ids_signals(
|
|
169
|
-
prev_universe.index.to_series(), prev_sub, prev_ticker_col
|
|
170
|
-
)
|
|
171
|
-
# in case the previous submission has a different ticker column,
|
|
172
|
-
# remap the ticker column of prev data to the current ticker column
|
|
173
|
-
filtered_prev_sub = remap_ticker_col(
|
|
174
|
-
filtered_prev_sub_df.set_index(prev_ticker_col),
|
|
175
|
-
universe=prev_universe,
|
|
176
|
-
ticker_col=curr_ticker_col,
|
|
177
|
-
)[curr_signal_col]
|
|
178
|
-
filtered_prev_sub = rank_and_fill_signal(
|
|
125
|
+
filtered_prev_sub = clean_submission_signals(
|
|
179
126
|
universe=universe,
|
|
180
|
-
submission=
|
|
181
|
-
|
|
127
|
+
submission=prev_sub,
|
|
128
|
+
submission_id=curr_signal_col,
|
|
129
|
+
index_col=curr_ticker_col,
|
|
130
|
+
rank_and_fill=True,
|
|
131
|
+
)
|
|
132
|
+
prev_neutralizer = (
|
|
133
|
+
remap_ids(
|
|
134
|
+
prev_neutralizer.reset_index(),
|
|
135
|
+
universe,
|
|
136
|
+
str(prev_neutralizer.index.name),
|
|
137
|
+
curr_ticker_col,
|
|
138
|
+
)
|
|
139
|
+
.set_index(curr_ticker_col)
|
|
140
|
+
.filter(like="neutralizer_")
|
|
182
141
|
)
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
universe
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
prev_weight
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
ticker_col=curr_ticker_col,
|
|
192
|
-
)[prev_weight.name]
|
|
193
|
-
prev_neutralized_weights = neutral_weight(
|
|
194
|
-
filtered_prev_sub, prev_signal_col, prev_neutralizer, prev_weight
|
|
142
|
+
prev_weight = remap_ids(
|
|
143
|
+
prev_weight.reset_index(),
|
|
144
|
+
universe,
|
|
145
|
+
str(prev_weight.index.name),
|
|
146
|
+
curr_ticker_col,
|
|
147
|
+
).set_index(curr_ticker_col)[prev_weight.name]
|
|
148
|
+
prev_neutralized_weights = generate_neutralized_weights(
|
|
149
|
+
filtered_prev_sub.to_frame(), prev_neutralizer, prev_weight
|
|
195
150
|
)
|
|
196
151
|
try:
|
|
197
152
|
churn_val = abs(churn(curr_sub_vector, filtered_prev_sub))
|
|
@@ -62,17 +62,6 @@ def validate_headers_numerai(submission: pd.DataFrame) -> Tuple[str, str]:
|
|
|
62
62
|
|
|
63
63
|
|
|
64
64
|
def validate_headers_signals(submission: pd.DataFrame) -> Tuple[str, str]:
|
|
65
|
-
if "data_type" in submission.columns:
|
|
66
|
-
logger.warning(
|
|
67
|
-
"data_type column found in Signals submission. This is deprecated and will be removed in the future. "
|
|
68
|
-
"Please remove the data_type column from your Signals submission."
|
|
69
|
-
)
|
|
70
|
-
date_col = [
|
|
71
|
-
date_col
|
|
72
|
-
for date_col in SIGNALS_ALLOWED_DATE_COLS
|
|
73
|
-
if date_col in list(submission.columns)
|
|
74
|
-
]
|
|
75
|
-
submission = submission.drop(columns=["data_type", *date_col], errors="ignore")
|
|
76
65
|
return _validate_headers(
|
|
77
66
|
SIGNALS_ALLOWED_ID_COLS, SIGNALS_ALLOWED_PRED_COLS, submission
|
|
78
67
|
)
|
|
@@ -141,7 +130,7 @@ def _validate_ids(
|
|
|
141
130
|
# join on live_ids and ensure min tickers reached
|
|
142
131
|
assert (
|
|
143
132
|
len(live_sub) >= min_tickers
|
|
144
|
-
),
|
|
133
|
+
), "Not enough stocks submitted. Are you using the latest live ids or live universe?"
|
|
145
134
|
|
|
146
135
|
invalid_tickers = list(set(index_sub[id_col]).difference(set(live_sub[id_col])))
|
|
147
136
|
return live_sub, invalid_tickers
|
|
@@ -165,12 +154,33 @@ def validate_ids_crypto(
|
|
|
165
154
|
return _validate_ids(live_ids, submission, id_col, CRYPTO_MIN_TICKERS)
|
|
166
155
|
|
|
167
156
|
|
|
168
|
-
def
|
|
169
|
-
|
|
157
|
+
def remap_ids(
|
|
158
|
+
data: pd.DataFrame,
|
|
159
|
+
ticker_map: pd.Series | pd.DataFrame,
|
|
160
|
+
src_id_col: str,
|
|
161
|
+
dst_id_col: str,
|
|
162
|
+
) -> pd.DataFrame:
|
|
163
|
+
# first, index the universe and data on the source ids
|
|
164
|
+
indexed_map = ticker_map.reset_index().set_index(src_id_col)
|
|
165
|
+
indexed_data = data.set_index(src_id_col)
|
|
166
|
+
return (
|
|
167
|
+
# then, join the universe and data
|
|
168
|
+
indexed_map.join(indexed_data)
|
|
169
|
+
# get just the destination ids and prediction columns
|
|
170
|
+
.reset_index()[[dst_id_col, *indexed_data.columns]]
|
|
171
|
+
# finally, sort by the destination ticker column
|
|
172
|
+
.sort_values(dst_id_col)
|
|
173
|
+
)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def clean_submission(
|
|
177
|
+
live_ids: pd.Series | pd.DataFrame,
|
|
170
178
|
predictions: pd.DataFrame,
|
|
179
|
+
name: str,
|
|
171
180
|
id_col: str,
|
|
172
181
|
rank_and_fill: bool,
|
|
173
|
-
|
|
182
|
+
tournament: int,
|
|
183
|
+
) -> pd.Series:
|
|
174
184
|
"""Prepare predictions for submission to Numerai.
|
|
175
185
|
Filters out ids not in live data, drops duplicates, sets ids as index,
|
|
176
186
|
then optionally ranks (keeping ties) and fills NaNs with 0.5.
|
|
@@ -182,28 +192,99 @@ def clean_predictions(
|
|
|
182
192
|
Arguments:
|
|
183
193
|
live_ids: pd.Series - the ids in the live data
|
|
184
194
|
predictions: pd.DataFrame - the predictions to clean
|
|
195
|
+
name: str - the name of the submission (used for renaming)
|
|
185
196
|
id_col: str - the column name of the ids
|
|
186
197
|
rank_and_fill: bool - whether to rank and fill NaNs with 0.5
|
|
187
198
|
left_join_ids: bool - whether to left join the predictions onto the ids
|
|
188
199
|
"""
|
|
189
200
|
assert len(live_ids) > 0, "live_ids must not be empty"
|
|
190
|
-
|
|
201
|
+
if isinstance(live_ids, pd.DataFrame):
|
|
202
|
+
assert live_ids.isna().sum().sum() == 0, "live_ids must not contain NaNs"
|
|
203
|
+
else:
|
|
204
|
+
assert live_ids.isna().sum() == 0, "live_ids must not contain NaNs"
|
|
191
205
|
assert len(predictions) > 0, "predictions must not be empty"
|
|
192
206
|
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
207
|
+
header_fn = {
|
|
208
|
+
8: validate_headers_numerai,
|
|
209
|
+
11: validate_headers_signals,
|
|
210
|
+
12: validate_headers_crypto,
|
|
211
|
+
}
|
|
212
|
+
assert (
|
|
213
|
+
tournament in header_fn
|
|
214
|
+
), f"Unsupported tournament {tournament} for cleaning predictions"
|
|
215
|
+
ticker_col, signal_col = header_fn[tournament](predictions)
|
|
216
|
+
|
|
217
|
+
clean_preds = (
|
|
218
|
+
remap_ids(predictions, live_ids, ticker_col, id_col)
|
|
219
|
+
# drop NaNs and duplicates
|
|
220
|
+
.dropna(subset=[id_col])
|
|
201
221
|
.drop_duplicates(subset=id_col, keep="first")
|
|
202
|
-
# set ids as index
|
|
222
|
+
# set ids as index and sort
|
|
203
223
|
.set_index(id_col)
|
|
204
224
|
.sort_index()
|
|
205
|
-
|
|
225
|
+
# rename to given name
|
|
226
|
+
.rename(columns={signal_col: name})
|
|
227
|
+
)[name]
|
|
206
228
|
# rank and fill with 0.5
|
|
207
229
|
if rank_and_fill:
|
|
208
|
-
|
|
209
|
-
return
|
|
230
|
+
clean_preds = tie_kept_rank(clean_preds).fillna(0.5)
|
|
231
|
+
return clean_preds
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def clean_submission_numerai(
|
|
235
|
+
live_ids: pd.Series, submission: pd.DataFrame, user_id: str
|
|
236
|
+
) -> pd.Series:
|
|
237
|
+
return clean_submission(
|
|
238
|
+
live_ids=live_ids,
|
|
239
|
+
predictions=submission,
|
|
240
|
+
name=user_id,
|
|
241
|
+
id_col="id",
|
|
242
|
+
rank_and_fill=True,
|
|
243
|
+
tournament=8,
|
|
244
|
+
)
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def clean_submission_signals(
|
|
248
|
+
universe: pd.DataFrame,
|
|
249
|
+
submission: pd.DataFrame,
|
|
250
|
+
submission_id: str,
|
|
251
|
+
index_col: str,
|
|
252
|
+
rank_and_fill: bool = True,
|
|
253
|
+
) -> pd.Series:
|
|
254
|
+
# drop data_type and date columns if they exist
|
|
255
|
+
if "data_type" in submission.columns:
|
|
256
|
+
logger.warning(
|
|
257
|
+
"data_type column found in Signals submission. This is deprecated and support will be removed in the future. "
|
|
258
|
+
"Please remove the data_type column from your Signals submission."
|
|
259
|
+
)
|
|
260
|
+
date_col = [
|
|
261
|
+
date_col
|
|
262
|
+
for date_col in SIGNALS_ALLOWED_DATE_COLS
|
|
263
|
+
if date_col in list(submission.columns)
|
|
264
|
+
]
|
|
265
|
+
submission = submission.drop(columns=["data_type", *date_col], errors="ignore")
|
|
266
|
+
return clean_submission(
|
|
267
|
+
live_ids=universe,
|
|
268
|
+
predictions=submission,
|
|
269
|
+
name=submission_id,
|
|
270
|
+
id_col=index_col,
|
|
271
|
+
rank_and_fill=rank_and_fill,
|
|
272
|
+
tournament=11,
|
|
273
|
+
)
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def clean_submission_crypto(
|
|
277
|
+
universe: pd.DataFrame,
|
|
278
|
+
submission: pd.DataFrame,
|
|
279
|
+
submission_id: str,
|
|
280
|
+
index_col: str,
|
|
281
|
+
rank_and_fill: bool = True,
|
|
282
|
+
):
|
|
283
|
+
return clean_submission(
|
|
284
|
+
live_ids=universe,
|
|
285
|
+
predictions=submission,
|
|
286
|
+
name=submission_id,
|
|
287
|
+
id_col=index_col,
|
|
288
|
+
rank_and_fill=rank_and_fill,
|
|
289
|
+
tournament=12,
|
|
290
|
+
)
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "numerai-tools"
|
|
3
|
+
version = "0.5.0.dev3"
|
|
4
|
+
description = "A collection of open-source tools to help interact with Numerai, model data, and automate submissions."
|
|
5
|
+
authors = [
|
|
6
|
+
{name = "Numerai Engineering",email = "engineering@numer.ai"}
|
|
7
|
+
]
|
|
8
|
+
license = {text = "MIT"}
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Development Status :: 5 - Production/Stable",
|
|
13
|
+
"Environment :: Console",
|
|
14
|
+
"Intended Audience :: Science/Research",
|
|
15
|
+
"License :: OSI Approved :: MIT License",
|
|
16
|
+
"Operating System :: OS Independent",
|
|
17
|
+
"Programming Language :: Python",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Topic :: Scientific/Engineering",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[project.urls]
|
|
23
|
+
homepage = "https://numer.ai"
|
|
24
|
+
repository = "https://github.com/numerai/numerai-tools"
|
|
25
|
+
documentation = "https://docs.numer.ai/"
|
|
26
|
+
|
|
27
|
+
[tool.poetry]
|
|
28
|
+
packages = [
|
|
29
|
+
{include = "numerai_tools", from = "."},
|
|
30
|
+
]
|
|
31
|
+
include = [
|
|
32
|
+
{ path = "LICENSE", format = ["sdist", "wheel"] },
|
|
33
|
+
{ path = "README.md", format = ["sdist", "wheel"] },
|
|
34
|
+
{ path = "numerai_tools/py.typed", format = ["sdist", "wheel"] }
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[tool.poetry.dependencies]
|
|
38
|
+
pandas = "^2.2.2"
|
|
39
|
+
numpy = "^2.0.0"
|
|
40
|
+
scipy = "^1.13.0"
|
|
41
|
+
scikit-learn = "^1.5.0"
|
|
42
|
+
|
|
43
|
+
[tool.poetry.group.dev.dependencies]
|
|
44
|
+
pytest = "^8.3.4"
|
|
45
|
+
mypy = "^1.15.0"
|
|
46
|
+
ruff = "^0.5.4"
|
|
47
|
+
pandas-stubs = "^2.3.0.250703"
|
|
48
|
+
scipy-stubs = "^1.16.1.0"
|
|
49
|
+
|
|
50
|
+
[build-system]
|
|
51
|
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
|
52
|
+
build-backend = "poetry.core.masonry.api"
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.1
|
|
2
|
-
Name: numerai_tools
|
|
3
|
-
Version: 0.5.0.dev1
|
|
4
|
-
Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
|
|
5
|
-
Home-page: https://github.com/numerai/numerai-tools
|
|
6
|
-
Maintainer: Numerai
|
|
7
|
-
Maintainer-email: support@numer.ai
|
|
8
|
-
License: MIT License
|
|
9
|
-
Platform: OS Independent
|
|
10
|
-
Classifier: Development Status :: 5 - Production/Stable
|
|
11
|
-
Classifier: Environment :: Console
|
|
12
|
-
Classifier: Intended Audience :: Science/Research
|
|
13
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
-
Classifier: Operating System :: OS Independent
|
|
15
|
-
Classifier: Programming Language :: Python
|
|
16
|
-
Classifier: Programming Language :: Python :: 3
|
|
17
|
-
Classifier: Topic :: Scientific/Engineering
|
|
18
|
-
Description-Content-Type: text/markdown
|
|
19
|
-
License-File: LICENSE
|
|
20
|
-
|
|
21
|
-
# numerai-tools
|
|
22
|
-
A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.1
|
|
2
|
-
Name: numerai-tools
|
|
3
|
-
Version: 0.5.0.dev1
|
|
4
|
-
Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
|
|
5
|
-
Home-page: https://github.com/numerai/numerai-tools
|
|
6
|
-
Maintainer: Numerai
|
|
7
|
-
Maintainer-email: support@numer.ai
|
|
8
|
-
License: MIT License
|
|
9
|
-
Platform: OS Independent
|
|
10
|
-
Classifier: Development Status :: 5 - Production/Stable
|
|
11
|
-
Classifier: Environment :: Console
|
|
12
|
-
Classifier: Intended Audience :: Science/Research
|
|
13
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
-
Classifier: Operating System :: OS Independent
|
|
15
|
-
Classifier: Programming Language :: Python
|
|
16
|
-
Classifier: Programming Language :: Python :: 3
|
|
17
|
-
Classifier: Topic :: Scientific/Engineering
|
|
18
|
-
Description-Content-Type: text/markdown
|
|
19
|
-
License-File: LICENSE
|
|
20
|
-
|
|
21
|
-
# numerai-tools
|
|
22
|
-
A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
LICENSE
|
|
2
|
-
README.md
|
|
3
|
-
setup.py
|
|
4
|
-
numerai_tools/__init__.py
|
|
5
|
-
numerai_tools/py.typed
|
|
6
|
-
numerai_tools/scoring.py
|
|
7
|
-
numerai_tools/signals.py
|
|
8
|
-
numerai_tools/submissions.py
|
|
9
|
-
numerai_tools.egg-info/PKG-INFO
|
|
10
|
-
numerai_tools.egg-info/SOURCES.txt
|
|
11
|
-
numerai_tools.egg-info/dependency_links.txt
|
|
12
|
-
numerai_tools.egg-info/requires.txt
|
|
13
|
-
numerai_tools.egg-info/top_level.txt
|
|
14
|
-
tests/test_scoring.py
|
|
15
|
-
tests/test_signals.py
|
|
16
|
-
tests/test_submissions.py
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
numerai_tools
|