numerai-tools 0.4.2.dev1__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- numerai_tools-0.5.0/PKG-INFO +40 -0
- numerai_tools-0.5.0/README.md +15 -0
- {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0}/numerai_tools/scoring.py +111 -92
- numerai_tools-0.5.0/numerai_tools/signals.py +206 -0
- numerai_tools-0.5.0/numerai_tools/submissions.py +348 -0
- numerai_tools-0.5.0/pyproject.toml +52 -0
- numerai_tools-0.4.2.dev1/PKG-INFO +0 -22
- numerai_tools-0.4.2.dev1/README.md +0 -2
- numerai_tools-0.4.2.dev1/numerai_tools/signals.py +0 -72
- numerai_tools-0.4.2.dev1/numerai_tools/submissions.py +0 -191
- numerai_tools-0.4.2.dev1/numerai_tools.egg-info/PKG-INFO +0 -22
- numerai_tools-0.4.2.dev1/numerai_tools.egg-info/SOURCES.txt +0 -16
- numerai_tools-0.4.2.dev1/numerai_tools.egg-info/dependency_links.txt +0 -1
- numerai_tools-0.4.2.dev1/numerai_tools.egg-info/requires.txt +0 -4
- numerai_tools-0.4.2.dev1/numerai_tools.egg-info/top_level.txt +0 -1
- numerai_tools-0.4.2.dev1/setup.cfg +0 -4
- numerai_tools-0.4.2.dev1/setup.py +0 -47
- numerai_tools-0.4.2.dev1/tests/test_scoring.py +0 -346
- numerai_tools-0.4.2.dev1/tests/test_signals.py +0 -51
- numerai_tools-0.4.2.dev1/tests/test_submissions.py +0 -486
- {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0}/LICENSE +0 -0
- {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0}/numerai_tools/__init__.py +0 -0
- {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0}/numerai_tools/py.typed +0 -0
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: numerai-tools
|
|
3
|
+
Version: 0.5.0
|
|
4
|
+
Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: Numerai Engineering
|
|
7
|
+
Author-email: engineering@numer.ai
|
|
8
|
+
Requires-Python: >=3.11
|
|
9
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
10
|
+
Classifier: Environment :: Console
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering
|
|
17
|
+
Requires-Dist: numpy (>=2.0.0,<3.0.0)
|
|
18
|
+
Requires-Dist: pandas (>=2.2.2,<3.0.0)
|
|
19
|
+
Requires-Dist: scikit-learn (>=1.5.0,<2.0.0)
|
|
20
|
+
Requires-Dist: scipy (>=1.13.0,<2.0.0)
|
|
21
|
+
Project-URL: Documentation, https://docs.numer.ai/
|
|
22
|
+
Project-URL: Homepage, https://numer.ai
|
|
23
|
+
Project-URL: Repository, https://github.com/numerai/numerai-tools
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# numerai-tools
|
|
27
|
+
A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
```
|
|
31
|
+
pip install numerai-tools
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Structure
|
|
35
|
+
|
|
36
|
+
- The `scoring.py` module contains critical functions used to score submissions. We use this code in our scoring system. Leverage this to optimize your models for the tournaments.
|
|
37
|
+
|
|
38
|
+
- The `submissions.py` module provides helper functions to ensure your submissions are valid and formatted correctly. Use this in your automated prediction pipelines to ensure uploads don't fail.
|
|
39
|
+
|
|
40
|
+
- The `signals.py` module provides code specific to Numerai Signals such as churn and turnover. Use this to ensure your Signals submissions are properly formatted.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# numerai-tools
|
|
2
|
+
A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
|
|
3
|
+
|
|
4
|
+
## Installation
|
|
5
|
+
```
|
|
6
|
+
pip install numerai-tools
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
## Structure
|
|
10
|
+
|
|
11
|
+
- The `scoring.py` module contains critical functions used to score submissions. We use this code in our scoring system. Leverage this to optimize your models for the tournaments.
|
|
12
|
+
|
|
13
|
+
- The `submissions.py` module provides helper functions to ensure your submissions are valid and formatted correctly. Use this in your automated prediction pipelines to ensure uploads don't fail.
|
|
14
|
+
|
|
15
|
+
- The `signals.py` module provides code specific to Numerai Signals such as churn and turnover. Use this to ensure your Signals submissions are properly formatted.
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
from typing import List, Tuple, Union, Optional, TypeVar
|
|
1
|
+
from typing import List, Literal, Tuple, Union, Optional, TypeVar, cast, Any
|
|
2
2
|
|
|
3
3
|
import numpy as np
|
|
4
|
-
import pandas as pd
|
|
5
|
-
from scipy import stats
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from scipy import stats
|
|
6
6
|
from sklearn.preprocessing import OneHotEncoder # type: ignore
|
|
7
7
|
|
|
8
8
|
|
|
@@ -14,6 +14,7 @@ DEFAULT_MAX_FILTERED_INDEX_RATIO = 0.2
|
|
|
14
14
|
|
|
15
15
|
S1 = TypeVar("S1", bound=Union[pd.DataFrame, pd.Series])
|
|
16
16
|
S2 = TypeVar("S2", bound=Union[pd.DataFrame, pd.Series])
|
|
17
|
+
RANK_METHOD_TYPE = Literal["average", "min", "max", "first", "dense"]
|
|
17
18
|
|
|
18
19
|
|
|
19
20
|
def filter_sort_index(
|
|
@@ -43,12 +44,13 @@ def filter_sort_index(
|
|
|
43
44
|
"s2 does not have enough overlapping ids with s1,"
|
|
44
45
|
f" must have >= {round(1-max_filtered_ratio,2)*100}% overlapping ids"
|
|
45
46
|
)
|
|
46
|
-
return s1.loc[ids].sort_index(), s2.loc[ids].sort_index()
|
|
47
|
+
return cast(S1, s1.loc[ids].sort_index()), cast(S2, s2.loc[ids].sort_index())
|
|
47
48
|
|
|
48
49
|
|
|
49
50
|
def filter_sort_index_many(
|
|
50
|
-
inputs: List[
|
|
51
|
-
|
|
51
|
+
inputs: List[Any],
|
|
52
|
+
max_filtered_ratio: float = DEFAULT_MAX_FILTERED_INDEX_RATIO,
|
|
53
|
+
) -> List[Any]:
|
|
52
54
|
"""Filters the indices of the given list of series to match each other,
|
|
53
55
|
then sorts the indices, then checks that we didn't filter too many indices
|
|
54
56
|
before returning the filtered and sorted series.
|
|
@@ -74,43 +76,72 @@ def filter_sort_index_many(
|
|
|
74
76
|
|
|
75
77
|
|
|
76
78
|
def filter_sort_top_bottom(
|
|
77
|
-
s: pd.Series, top_bottom: int
|
|
78
|
-
) ->
|
|
79
|
+
s: pd.Series, top_bottom: int
|
|
80
|
+
) -> Tuple[pd.Series, pd.Series]:
|
|
79
81
|
"""Filters the series according to the top n and bottom n values
|
|
80
|
-
then sorts the index and returns
|
|
82
|
+
then sorts the index and returns two filtered and sorted series
|
|
83
|
+
for the top and bottom values respectively.
|
|
81
84
|
|
|
82
85
|
Arguments:
|
|
83
86
|
s: pd.Series - the data to filter and sort
|
|
84
87
|
top_bottom: int - the number of top n and bottom n values to keep
|
|
85
88
|
|
|
86
89
|
Returns:
|
|
87
|
-
pd.Series - the filtered and sorted
|
|
90
|
+
Tuple[pd.Series, pd.Series] - the filtered and sorted top and bottom series respectively
|
|
88
91
|
"""
|
|
89
92
|
tb_idx = np.argsort(s, kind="stable")
|
|
90
93
|
bot = s.iloc[tb_idx[:top_bottom]]
|
|
91
94
|
top = s.iloc[tb_idx[-top_bottom:]]
|
|
92
|
-
|
|
93
|
-
return pd.concat([top, bot]).sort_index()
|
|
94
|
-
else:
|
|
95
|
-
return top.sort_index(), bot.sort_index()
|
|
95
|
+
return top.sort_index(), bot.sort_index()
|
|
96
96
|
|
|
97
97
|
|
|
98
|
-
def
|
|
99
|
-
"""
|
|
98
|
+
def filter_sort_top_bottom_concat(s: pd.Series, top_bottom: int) -> pd.Series:
|
|
99
|
+
"""Similar to filter_sort_top_bottom, but concatenates the top and bottom series
|
|
100
|
+
into 1 series and then sorts the index.
|
|
100
101
|
|
|
101
102
|
Arguments:
|
|
102
|
-
|
|
103
|
+
s: pd.Series - the data to filter and sort
|
|
104
|
+
top_bottom: int - the number of top n and bottom n values to keep
|
|
105
|
+
|
|
106
|
+
Returns:
|
|
107
|
+
pd.Series - the concatenated and sorted series of top and bottom values
|
|
108
|
+
"""
|
|
109
|
+
top, bot = filter_sort_top_bottom(s, top_bottom)
|
|
110
|
+
return pd.concat([top, bot]).sort_index()
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def rank_series(s: pd.Series, method: RANK_METHOD_TYPE = "average") -> pd.Series:
|
|
114
|
+
"""Percentile rank a pandas Series, centering values around 0.5.
|
|
115
|
+
|
|
116
|
+
Arguments:
|
|
117
|
+
s: pd.Series - the data to rank
|
|
103
118
|
method: str - the pandas ranking method to use, options:
|
|
104
119
|
'average' (default) - keeps ties
|
|
105
120
|
'first' - breaks ties by index
|
|
106
121
|
|
|
107
122
|
Returns:
|
|
108
|
-
pd.
|
|
123
|
+
pd.Series - the ranked Series
|
|
109
124
|
"""
|
|
110
|
-
assert np.array_equal(
|
|
111
|
-
return
|
|
112
|
-
|
|
113
|
-
|
|
125
|
+
assert np.array_equal(s.index.sort_values(), s.index), "unsorted index found"
|
|
126
|
+
return (s.rank(method=method) - 0.5) / s.count()
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def rank(s: S1, method: RANK_METHOD_TYPE = "average") -> S1:
|
|
130
|
+
"""Percentile rank each columns or series, centering values around 0.5
|
|
131
|
+
|
|
132
|
+
Arguments:
|
|
133
|
+
s: pd.DataFrame | pd.Series - the data to rank
|
|
134
|
+
method: str - the pandas ranking method to use, options:
|
|
135
|
+
'average' (default) - keeps ties
|
|
136
|
+
'first' - breaks ties by index
|
|
137
|
+
|
|
138
|
+
Returns:
|
|
139
|
+
pd.DataFrame | pd.Series - the ranked input data
|
|
140
|
+
"""
|
|
141
|
+
if isinstance(s, pd.Series):
|
|
142
|
+
return cast(S1, rank_series(s, method))
|
|
143
|
+
else:
|
|
144
|
+
return s.apply(lambda series: rank(series, method=method))
|
|
114
145
|
|
|
115
146
|
|
|
116
147
|
def tie_broken_rank(df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -118,9 +149,9 @@ def tie_broken_rank(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
118
149
|
return rank(df, "first")
|
|
119
150
|
|
|
120
151
|
|
|
121
|
-
def tie_kept_rank(
|
|
152
|
+
def tie_kept_rank(s: S1) -> S1:
|
|
122
153
|
"""Rank columns, but keep ties."""
|
|
123
|
-
return rank(
|
|
154
|
+
return cast(S1, rank(s, "average"))
|
|
124
155
|
|
|
125
156
|
|
|
126
157
|
def min_max_normalize(s: pd.Series) -> pd.Series:
|
|
@@ -133,14 +164,14 @@ def variance_normalize(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
133
164
|
return df / np.std(df, axis=0)
|
|
134
165
|
|
|
135
166
|
|
|
136
|
-
def weight_normalize(
|
|
137
|
-
"""Scale a
|
|
138
|
-
return
|
|
167
|
+
def weight_normalize(s: S1) -> S1:
|
|
168
|
+
"""Scale a input such that all columns have absolute value sum == 1."""
|
|
169
|
+
return cast(S1, s / s.abs().sum(axis=0))
|
|
139
170
|
|
|
140
171
|
|
|
141
|
-
def center(
|
|
142
|
-
"""Shift the
|
|
143
|
-
return
|
|
172
|
+
def center(s: S1) -> S1:
|
|
173
|
+
"""Shift the input such that all columns have mean == 0."""
|
|
174
|
+
return cast(S1, s - s.mean())
|
|
144
175
|
|
|
145
176
|
|
|
146
177
|
def standardize(df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -179,7 +210,7 @@ def pearson_correlation(
|
|
|
179
210
|
target: pd.Series, predictions: pd.Series, top_bottom: Optional[int] = None
|
|
180
211
|
) -> float:
|
|
181
212
|
if top_bottom is not None and top_bottom > 0:
|
|
182
|
-
predictions =
|
|
213
|
+
predictions = filter_sort_top_bottom_concat(predictions, top_bottom)
|
|
183
214
|
target, predictions = filter_sort_index(
|
|
184
215
|
target, predictions, (1 - top_bottom / len(target))
|
|
185
216
|
)
|
|
@@ -205,7 +236,7 @@ def power(df: pd.DataFrame, p: float) -> pd.DataFrame:
|
|
|
205
236
|
"""
|
|
206
237
|
assert not df.isna().any().any(), "Data contains NaNs"
|
|
207
238
|
assert np.array_equal(df.index.sort_values(), df.index), "Index is not sorted"
|
|
208
|
-
result = np.sign(df) * np.abs(df) ** p
|
|
239
|
+
result = cast(pd.DataFrame, np.sign(df) * np.abs(df) ** p)
|
|
209
240
|
assert ((result.std() == 0) | (result.corrwith(df) >= 0.9)).all()
|
|
210
241
|
return result
|
|
211
242
|
|
|
@@ -221,7 +252,7 @@ def gaussian(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
221
252
|
pd.DataFrame - the gaussianized data
|
|
222
253
|
"""
|
|
223
254
|
assert np.array_equal(df.index.sort_values(), df.index)
|
|
224
|
-
return df.apply(lambda series: stats.norm.ppf(series))
|
|
255
|
+
return df.apply(lambda series: cast(np.ndarray, stats.norm.ppf(series)))
|
|
225
256
|
|
|
226
257
|
|
|
227
258
|
def orthogonalize(v: np.ndarray, u: np.ndarray) -> np.ndarray:
|
|
@@ -303,7 +334,7 @@ def correlation_contribution(
|
|
|
303
334
|
m = gaussian(tie_kept_rank(meta_model.to_frame()))[meta_model.name].values
|
|
304
335
|
|
|
305
336
|
# orthogonalize predictions wrt meta model
|
|
306
|
-
neutral_preds = orthogonalize(p, m)
|
|
337
|
+
neutral_preds = orthogonalize(p, cast(np.ndarray, m))
|
|
307
338
|
|
|
308
339
|
# convert target to buckets [-2, -1, 0, 1, 2]
|
|
309
340
|
if (live_targets >= 0).all() and (live_targets <= 1).all():
|
|
@@ -314,9 +345,9 @@ def correlation_contribution(
|
|
|
314
345
|
# filter each column to its top and bottom n predictions
|
|
315
346
|
neutral_preds_df = pd.DataFrame(
|
|
316
347
|
neutral_preds, columns=predictions.columns, index=predictions.index
|
|
317
|
-
).apply(lambda p:
|
|
318
|
-
|
|
319
|
-
|
|
348
|
+
).apply(lambda p: filter_sort_top_bottom_concat(p, top_bottom))
|
|
349
|
+
mmc_matrix = (
|
|
350
|
+
# create a dataframe for targets to match the filtered predictions
|
|
320
351
|
neutral_preds_df.apply(
|
|
321
352
|
lambda p: filter_sort_index(
|
|
322
353
|
p,
|
|
@@ -326,19 +357,15 @@ def correlation_contribution(
|
|
|
326
357
|
)
|
|
327
358
|
.fillna(0)
|
|
328
359
|
.T.values
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
# multiply target and neutralized predictions
|
|
334
|
-
# this is equivalent to covariance b/c mean = 0
|
|
335
|
-
mmc = live_targets @ neutral_preds
|
|
336
|
-
if top_bottom is not None and top_bottom > 0:
|
|
360
|
+
# then fill NaNs with 0 so we don't get NaNs in the dot product
|
|
361
|
+
# and multiply target w/ neutral preds to get MMC
|
|
362
|
+
) @ neutral_preds_df.fillna(0).values
|
|
337
363
|
# only the diagonal is the proper score
|
|
338
|
-
mmc = np.diag(
|
|
364
|
+
mmc = np.diag(mmc_matrix) / (top_bottom * 2)
|
|
339
365
|
else:
|
|
340
|
-
|
|
341
|
-
|
|
366
|
+
# multiply target and neutralized predictions
|
|
367
|
+
# this is equivalent to covariance b/c mean = 0
|
|
368
|
+
mmc = (live_targets @ neutral_preds) / len(live_targets)
|
|
342
369
|
return pd.Series(mmc, index=predictions.columns)
|
|
343
370
|
|
|
344
371
|
|
|
@@ -461,7 +488,7 @@ def numerai_corr(
|
|
|
461
488
|
Returns:
|
|
462
489
|
pd.Series - the resulting correlation scores for each column in predictions
|
|
463
490
|
"""
|
|
464
|
-
targets = targets
|
|
491
|
+
targets = center(targets)
|
|
465
492
|
targets, predictions = filter_sort_index(
|
|
466
493
|
targets, predictions, max_filtered_index_ratio
|
|
467
494
|
)
|
|
@@ -522,21 +549,33 @@ def max_feature_correlation(
|
|
|
522
549
|
feature_correlations = features.apply(
|
|
523
550
|
lambda f: pearson_correlation(f, s, top_bottom)
|
|
524
551
|
)
|
|
525
|
-
feature_correlations =
|
|
552
|
+
feature_correlations = feature_correlations.abs()
|
|
526
553
|
max_feature = feature_correlations.idxmax()
|
|
527
554
|
max_corr = feature_correlations[max_feature]
|
|
528
|
-
return max_feature, max_corr
|
|
555
|
+
return str(max_feature), max_corr
|
|
529
556
|
|
|
530
557
|
|
|
531
558
|
def generate_neutralized_weights(
|
|
532
|
-
predictions: pd.
|
|
559
|
+
predictions: pd.DataFrame,
|
|
533
560
|
neutralizers: pd.DataFrame,
|
|
534
561
|
sample_weights: pd.Series,
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
562
|
+
center_and_normalize: bool = False,
|
|
563
|
+
) -> pd.DataFrame:
|
|
564
|
+
assert not predictions.isna().any().any(), "Predictions contain NaNs"
|
|
565
|
+
assert not neutralizers.isna().any().any(), "Normalization factors contain NaNs"
|
|
566
|
+
assert not sample_weights.isna().any(), "Weights contain NaNs"
|
|
567
|
+
ranked_predictions = tie_kept_rank__gaussianize__pow_1_5(predictions)
|
|
568
|
+
ranked_predictions, neutralizers, sample_weights = filter_sort_index_many(
|
|
569
|
+
[ranked_predictions, neutralizers, sample_weights]
|
|
570
|
+
)
|
|
571
|
+
neutral_weights = ranked_predictions.apply(
|
|
572
|
+
lambda s_prime: (
|
|
573
|
+
s_prime - neutralizers @ (neutralizers.T @ (sample_weights * s_prime))
|
|
574
|
+
)
|
|
575
|
+
* sample_weights
|
|
538
576
|
)
|
|
539
|
-
|
|
577
|
+
if center_and_normalize:
|
|
578
|
+
neutral_weights = weight_normalize(center(neutral_weights))
|
|
540
579
|
return neutral_weights
|
|
541
580
|
|
|
542
581
|
|
|
@@ -557,18 +596,9 @@ def alpha(
|
|
|
557
596
|
sample_weights: pd.Series - the universe sampling weights
|
|
558
597
|
targets: pd.Series - the live targets to evaluate against
|
|
559
598
|
"""
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
predictions, neutralizers, sample_weights, targets = filter_sort_index_many(
|
|
564
|
-
[predictions, neutralizers, sample_weights, targets]
|
|
565
|
-
)
|
|
566
|
-
|
|
567
|
-
weights = tie_kept_rank__gaussianize__pow_1_5(predictions).apply(
|
|
568
|
-
lambda s_prime: generate_neutralized_weights(
|
|
569
|
-
s_prime, neutralizers, sample_weights
|
|
570
|
-
)
|
|
571
|
-
)
|
|
599
|
+
targets = center(targets)
|
|
600
|
+
predictions, targets = filter_sort_index(predictions, targets)
|
|
601
|
+
weights = generate_neutralized_weights(predictions, neutralizers, sample_weights)
|
|
572
602
|
alpha_scores = weights.apply(lambda w: w @ targets) / len(targets)
|
|
573
603
|
return alpha_scores
|
|
574
604
|
|
|
@@ -593,33 +623,22 @@ def meta_portfolio_contribution(
|
|
|
593
623
|
sample_weights: pd.Series - the universe sampling weights
|
|
594
624
|
targets: pd.Series - the live targets to evaluate against
|
|
595
625
|
"""
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
assert not sample_weights.isna().any(), "Weights contain NaNs"
|
|
599
|
-
predictions, neutralizers, sample_weights, targets = filter_sort_index_many(
|
|
600
|
-
[predictions, neutralizers, sample_weights, targets]
|
|
601
|
-
)
|
|
626
|
+
targets = center(targets)
|
|
627
|
+
predictions, targets = filter_sort_index(predictions, targets)
|
|
602
628
|
stake_weights = weight_normalize(stakes.fillna(0))
|
|
603
629
|
assert np.isclose(stake_weights.sum(), 1), "Stakes must sum to 1"
|
|
604
|
-
weights =
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
)
|
|
609
|
-
w = weights[stakes.index].values
|
|
610
|
-
s = stake_weights.values
|
|
611
|
-
t = targets.values
|
|
630
|
+
weights = generate_neutralized_weights(predictions, neutralizers, sample_weights)
|
|
631
|
+
w = cast(np.ndarray, weights[stakes.index].values)
|
|
632
|
+
s = cast(np.ndarray, stake_weights.values)
|
|
633
|
+
t = cast(np.ndarray, targets.values)
|
|
612
634
|
swp = w @ s
|
|
613
635
|
swp = swp - swp.mean()
|
|
614
|
-
|
|
636
|
+
l1_norm = np.sum(np.abs(swp))
|
|
637
|
+
l1_norm_squared = np.power(l1_norm, 2)
|
|
615
638
|
swp_sign = np.sign(swp)
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
)
|
|
621
|
-
zero_mean_jac_vec_prod = (
|
|
622
|
-
alpha_unnormalized_swp_grad - alpha_unnormalized_swp_grad.mean()
|
|
623
|
-
)
|
|
624
|
-
mpc = (w.T @ zero_mean_jac_vec_prod).squeeze()
|
|
639
|
+
swp_alpha = np.dot(swp, t)
|
|
640
|
+
directional_gradient = l1_norm * t - swp_sign * swp_alpha
|
|
641
|
+
jacobian_vector_product = directional_gradient.reshape(-1, 1) / l1_norm_squared
|
|
642
|
+
centered_jacobian = jacobian_vector_product - jacobian_vector_product.mean()
|
|
643
|
+
mpc = (w.T @ centered_jacobian).squeeze()
|
|
625
644
|
return pd.Series(mpc, index=stakes.index)
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
from typing import Tuple, Optional
|
|
2
|
+
|
|
3
|
+
from numerai_tools.scoring import (
|
|
4
|
+
filter_sort_index,
|
|
5
|
+
filter_sort_top_bottom,
|
|
6
|
+
spearman_correlation,
|
|
7
|
+
generate_neutralized_weights,
|
|
8
|
+
)
|
|
9
|
+
from numerai_tools.submissions import (
|
|
10
|
+
validate_submission_signals,
|
|
11
|
+
clean_submission,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
import pandas as pd
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def churn(
|
|
18
|
+
s1: pd.Series,
|
|
19
|
+
s2: pd.Series,
|
|
20
|
+
top_bottom: Optional[int] = None,
|
|
21
|
+
) -> float:
|
|
22
|
+
"""Calculate the churn between two series. Churn is the proportion of elements
|
|
23
|
+
that are different between the two series.
|
|
24
|
+
|
|
25
|
+
For 2 given series with overlapping indices, churn is 1 - Spearman Correlation.
|
|
26
|
+
If top_bottom is provided, the churn is calculated as the average of the % of
|
|
27
|
+
tickers that stay in the top and bottom predictions. This is only relevant when
|
|
28
|
+
the series are rank signals and not portfolio weights.
|
|
29
|
+
|
|
30
|
+
Arguments:
|
|
31
|
+
s1: pd.Series - the first series to compare
|
|
32
|
+
s2: pd.Series - the second series to compare
|
|
33
|
+
top_bottom: Optional[int] - the number of top and bottom predictions to use
|
|
34
|
+
when calculating the correlation. Results in
|
|
35
|
+
2*top_bottom predictions.
|
|
36
|
+
|
|
37
|
+
Returns:
|
|
38
|
+
float - the churn between the two series
|
|
39
|
+
"""
|
|
40
|
+
if top_bottom is not None and top_bottom > 0:
|
|
41
|
+
s1_top, s1_bot = filter_sort_top_bottom(s1, top_bottom)
|
|
42
|
+
s2_top, s2_bot = filter_sort_top_bottom(s2, top_bottom)
|
|
43
|
+
top_overlap = len(s1_top.index.intersection(s2_top.index)) / top_bottom
|
|
44
|
+
bot_overlap = len(s1_bot.index.intersection(s2_bot.index)) / top_bottom
|
|
45
|
+
avg_overlap = (top_overlap + bot_overlap) / 2
|
|
46
|
+
return 1 - avg_overlap
|
|
47
|
+
|
|
48
|
+
s1, s2 = filter_sort_index(s1, s2)
|
|
49
|
+
assert s1.std() > 0, "s1 must have non-zero standard deviation"
|
|
50
|
+
assert s2.std() > 0, "s2 must have non-zero standard deviation"
|
|
51
|
+
return 1 - spearman_correlation(s1, s2)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def turnover(
|
|
55
|
+
s1: pd.Series,
|
|
56
|
+
s2: pd.Series,
|
|
57
|
+
):
|
|
58
|
+
"""Calculate the turnover between two series. Turnover is the total change in weights between
|
|
59
|
+
the two series divided by 2.
|
|
60
|
+
|
|
61
|
+
For 2 given series with overlapping indices, join the series on index, fill nans with zeroes
|
|
62
|
+
and calculate turnover as the absolute total difference between the two series divided by 2.
|
|
63
|
+
This is only relevant when the series are portfolio weights and not rank signals.
|
|
64
|
+
|
|
65
|
+
Arguments:
|
|
66
|
+
s1: pd.Series - the first series to compare
|
|
67
|
+
s2: pd.Series - the second series to compare
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
float - the turnover between the two series
|
|
71
|
+
"""
|
|
72
|
+
s1, s2 = filter_sort_index(s1, s2)
|
|
73
|
+
turnover = (s1 - s2).abs().sum() / 2
|
|
74
|
+
return turnover
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def calculate_max_churn_and_turnover(
|
|
78
|
+
curr_sub: pd.Series,
|
|
79
|
+
curr_neutralizer: pd.DataFrame,
|
|
80
|
+
curr_sample_weight: pd.Series,
|
|
81
|
+
prev_subs: dict[str, pd.Series],
|
|
82
|
+
prev_neutralizers: dict[str, pd.DataFrame],
|
|
83
|
+
prev_sample_weights: dict[str, pd.Series],
|
|
84
|
+
) -> Tuple[float, float]:
|
|
85
|
+
"""Calculate the maximum churn and turnover of the current submission with respect to previous submissions.
|
|
86
|
+
This function iterates over previous submissions and calculates churn and turnover for each submission
|
|
87
|
+
against the current submission. It expects the following:
|
|
88
|
+
|
|
89
|
+
- all submissions, neutralizers, and sample weights are indexed on the same type of tickers/IDs
|
|
90
|
+
(e.g. all numerai_ticker, or all composite_figi, or all etc.)
|
|
91
|
+
|
|
92
|
+
- neutralizers and sample weights cover the full universe of their respective eras. This means you
|
|
93
|
+
should avoid removing rows from neutralizers or sample weights before passing them to this function.
|
|
94
|
+
|
|
95
|
+
In a live submission environment your submissions are joined on their respective full universes, ranked,
|
|
96
|
+
and then any NaNs are filled with 0.5 before calculating churn and turnover. So, if you provide filtered
|
|
97
|
+
neutralizers or sample weights, your locally calculated churn and turnover may not match the live value.
|
|
98
|
+
|
|
99
|
+
Arguments:
|
|
100
|
+
curr_sub: pd.Series - current-era submission indexed on tickers/ids
|
|
101
|
+
|
|
102
|
+
curr_neutralizer: pd.DataFrame
|
|
103
|
+
- current-era neutralizers indexed on the same type of tickers/ids.
|
|
104
|
+
We expect these to cover the full universe for the current era.
|
|
105
|
+
|
|
106
|
+
curr_sample_weight: pd.Series
|
|
107
|
+
- current-era sample weights indexed on the same type of tickers/ids.
|
|
108
|
+
We expect these to cover the full universe for the current era.
|
|
109
|
+
|
|
110
|
+
prev_subs: dict[str, pd.Series]
|
|
111
|
+
- a dictionary mapping datestamps to submissions, where each submission is a
|
|
112
|
+
Series indexed on the same type of tickers/ids as the current
|
|
113
|
+
submission. To calculate churn and turnover for a live submission,
|
|
114
|
+
use the most recent 5 submissions. For diagnostics, just provide the
|
|
115
|
+
last 1 era.
|
|
116
|
+
|
|
117
|
+
prev_neutralizers: dict[str, pd.DataFrame]
|
|
118
|
+
- a dictionary mapping datestamps to neutralizers DataFrames where each neutralizers
|
|
119
|
+
DataFrame is indexed on the same type of tickers/ids as the current submission.
|
|
120
|
+
We expect each of these to cover the full universe of their respective eras.
|
|
121
|
+
|
|
122
|
+
prev_sample_weights: dict[str, pd.Series]
|
|
123
|
+
- a dictionary mapping datestamps to sample weights where each sample weights
|
|
124
|
+
Series is indexed on the same type of tickers/ids as the current submission.
|
|
125
|
+
We expect each of these to cover the full universe of their respective eras.
|
|
126
|
+
|
|
127
|
+
Returns:
|
|
128
|
+
prev_week_max_churn -- the maximum churn from previous submissions
|
|
129
|
+
prev_week_max_turnover -- the maximum turnover from previous submissions
|
|
130
|
+
"""
|
|
131
|
+
(
|
|
132
|
+
curr_ticker_col,
|
|
133
|
+
curr_signal_col,
|
|
134
|
+
_,
|
|
135
|
+
curr_sub_df,
|
|
136
|
+
_,
|
|
137
|
+
) = validate_submission_signals(
|
|
138
|
+
universe=curr_sample_weight.index.to_frame(),
|
|
139
|
+
submission=curr_sub.reset_index(),
|
|
140
|
+
)
|
|
141
|
+
curr_sub = clean_submission(
|
|
142
|
+
universe=curr_sample_weight.index.to_frame(),
|
|
143
|
+
submission=curr_sub_df,
|
|
144
|
+
src_id_col=curr_ticker_col,
|
|
145
|
+
src_signal_col=curr_signal_col,
|
|
146
|
+
rank_and_fill=True,
|
|
147
|
+
)
|
|
148
|
+
churn_stats = []
|
|
149
|
+
turnover_stats = []
|
|
150
|
+
neutralized_weights = generate_neutralized_weights(
|
|
151
|
+
curr_sub.to_frame(),
|
|
152
|
+
curr_neutralizer,
|
|
153
|
+
curr_sample_weight,
|
|
154
|
+
center_and_normalize=True,
|
|
155
|
+
)[curr_sub.name]
|
|
156
|
+
for datestamp in prev_subs:
|
|
157
|
+
prev_sub = prev_subs[datestamp]
|
|
158
|
+
prev_neutralizer = prev_neutralizers[datestamp]
|
|
159
|
+
prev_sample_weight = prev_sample_weights[datestamp]
|
|
160
|
+
(
|
|
161
|
+
prev_ticker_col,
|
|
162
|
+
prev_signal_col,
|
|
163
|
+
_,
|
|
164
|
+
prev_sub_df,
|
|
165
|
+
_,
|
|
166
|
+
) = validate_submission_signals(
|
|
167
|
+
universe=prev_sample_weight.index.to_frame(),
|
|
168
|
+
submission=prev_sub.reset_index(),
|
|
169
|
+
)
|
|
170
|
+
prev_sub = clean_submission(
|
|
171
|
+
universe=prev_sample_weight.index.to_frame(),
|
|
172
|
+
submission=prev_sub_df,
|
|
173
|
+
src_id_col=prev_ticker_col,
|
|
174
|
+
src_signal_col=prev_signal_col,
|
|
175
|
+
dst_id_col=curr_ticker_col,
|
|
176
|
+
dst_signal_col=curr_signal_col,
|
|
177
|
+
rank_and_fill=True,
|
|
178
|
+
)
|
|
179
|
+
prev_neutralized_weights = generate_neutralized_weights(
|
|
180
|
+
prev_sub.to_frame(),
|
|
181
|
+
prev_neutralizer,
|
|
182
|
+
prev_sample_weight,
|
|
183
|
+
center_and_normalize=True,
|
|
184
|
+
)[prev_sub.name]
|
|
185
|
+
try:
|
|
186
|
+
churn_val = abs(churn(curr_sub, prev_sub))
|
|
187
|
+
except AssertionError as e:
|
|
188
|
+
if "does not have enough overlapping ids" in str(e):
|
|
189
|
+
continue
|
|
190
|
+
try:
|
|
191
|
+
turnover_val = abs(turnover(neutralized_weights, prev_neutralized_weights))
|
|
192
|
+
except AssertionError as e:
|
|
193
|
+
if "does not have enough overlapping ids" in str(e):
|
|
194
|
+
continue
|
|
195
|
+
|
|
196
|
+
churn_stats.append(churn_val)
|
|
197
|
+
turnover_stats.append(turnover_val)
|
|
198
|
+
if len(churn_stats) == 0:
|
|
199
|
+
prev_week_max_churn = 1.0
|
|
200
|
+
else:
|
|
201
|
+
prev_week_max_churn = max(churn_stats)
|
|
202
|
+
if len(turnover_stats) == 0:
|
|
203
|
+
prev_week_max_turnover = 1.0
|
|
204
|
+
else:
|
|
205
|
+
prev_week_max_turnover = max(turnover_stats)
|
|
206
|
+
return prev_week_max_churn, prev_week_max_turnover
|