numerai-tools 0.4.2.dev1__tar.gz → 0.5.0.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/PKG-INFO +1 -1
- {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/numerai_tools/scoring.py +15 -15
- numerai_tools-0.5.0.dev0/numerai_tools/signals.py +215 -0
- {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/numerai_tools/submissions.py +23 -4
- {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/numerai_tools.egg-info/PKG-INFO +1 -1
- {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/setup.py +1 -1
- numerai_tools-0.5.0.dev0/tests/test_signals.py +139 -0
- {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/tests/test_submissions.py +12 -0
- numerai_tools-0.4.2.dev1/numerai_tools/signals.py +0 -72
- numerai_tools-0.4.2.dev1/tests/test_signals.py +0 -51
- {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/LICENSE +0 -0
- {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/README.md +0 -0
- {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/numerai_tools/__init__.py +0 -0
- {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/numerai_tools/py.typed +0 -0
- {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/numerai_tools.egg-info/SOURCES.txt +0 -0
- {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/numerai_tools.egg-info/dependency_links.txt +0 -0
- {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/numerai_tools.egg-info/requires.txt +0 -0
- {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/numerai_tools.egg-info/top_level.txt +0 -0
- {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/setup.cfg +0 -0
- {numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/tests/test_scoring.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: numerai_tools
|
|
3
|
-
Version: 0.4.2.dev1
|
|
3
|
+
Version: 0.5.0.dev0
|
|
4
4
|
Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
|
|
5
5
|
Home-page: https://github.com/numerai/numerai-tools
|
|
6
6
|
Maintainer: Numerai
|
|
@@ -47,8 +47,9 @@ def filter_sort_index(
|
|
|
47
47
|
|
|
48
48
|
|
|
49
49
|
def filter_sort_index_many(
|
|
50
|
-
inputs: List[
|
|
51
|
-
|
|
50
|
+
inputs: List[pd.DataFrame],
|
|
51
|
+
max_filtered_ratio: float = DEFAULT_MAX_FILTERED_INDEX_RATIO,
|
|
52
|
+
) -> List[pd.DataFrame]:
|
|
52
53
|
"""Filters the indices of the given list of series to match each other,
|
|
53
54
|
then sorts the indices, then checks that we didn't filter too many indices
|
|
54
55
|
before returning the filtered and sorted series.
|
|
@@ -461,7 +462,7 @@ def numerai_corr(
|
|
|
461
462
|
Returns:
|
|
462
463
|
pd.Series - the resulting correlation scores for each column in predictions
|
|
463
464
|
"""
|
|
464
|
-
targets = targets
|
|
465
|
+
targets = center(targets)
|
|
465
466
|
targets, predictions = filter_sort_index(
|
|
466
467
|
targets, predictions, max_filtered_index_ratio
|
|
467
468
|
)
|
|
@@ -557,14 +558,15 @@ def alpha(
|
|
|
557
558
|
sample_weights: pd.Series - the universe sampling weights
|
|
558
559
|
targets: pd.Series - the live targets to evaluate against
|
|
559
560
|
"""
|
|
561
|
+
targets = center(targets)
|
|
560
562
|
assert not predictions.isna().any().any(), "Predictions contain NaNs"
|
|
561
563
|
assert not neutralizers.isna().any().any(), "Normalization factors contain NaNs"
|
|
562
564
|
assert not sample_weights.isna().any(), "Weights contain NaNs"
|
|
563
565
|
predictions, neutralizers, sample_weights, targets = filter_sort_index_many(
|
|
564
566
|
[predictions, neutralizers, sample_weights, targets]
|
|
565
567
|
)
|
|
566
|
-
|
|
567
|
-
weights =
|
|
568
|
+
ranked_preds = tie_kept_rank__gaussianize__pow_1_5(predictions)
|
|
569
|
+
weights = ranked_preds.apply(
|
|
568
570
|
lambda s_prime: generate_neutralized_weights(
|
|
569
571
|
s_prime, neutralizers, sample_weights
|
|
570
572
|
)
|
|
@@ -593,6 +595,7 @@ def meta_portfolio_contribution(
|
|
|
593
595
|
sample_weights: pd.Series - the universe sampling weights
|
|
594
596
|
targets: pd.Series - the live targets to evaluate against
|
|
595
597
|
"""
|
|
598
|
+
targets = center(targets)
|
|
596
599
|
assert not predictions.isna().any().any(), "Predictions contain NaNs"
|
|
597
600
|
assert not neutralizers.isna().any().any(), "Normalization factors contain NaNs"
|
|
598
601
|
assert not sample_weights.isna().any(), "Weights contain NaNs"
|
|
@@ -611,15 +614,12 @@ def meta_portfolio_contribution(
|
|
|
611
614
|
t = targets.values
|
|
612
615
|
swp = w @ s
|
|
613
616
|
swp = swp - swp.mean()
|
|
614
|
-
|
|
617
|
+
l1_norm = np.sum(np.abs(swp))
|
|
618
|
+
l1_norm_squared = np.power(l1_norm, 2)
|
|
615
619
|
swp_sign = np.sign(swp)
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
)
|
|
621
|
-
zero_mean_jac_vec_prod = (
|
|
622
|
-
alpha_unnormalized_swp_grad - alpha_unnormalized_swp_grad.mean()
|
|
623
|
-
)
|
|
624
|
-
mpc = (w.T @ zero_mean_jac_vec_prod).squeeze()
|
|
620
|
+
swp_alpha = np.dot(swp, t)
|
|
621
|
+
directional_gradient = l1_norm * t - swp_sign * swp_alpha
|
|
622
|
+
jacobian_vector_product = directional_gradient.reshape(-1, 1) / l1_norm_squared
|
|
623
|
+
centered_jacobian = jacobian_vector_product - jacobian_vector_product.mean()
|
|
624
|
+
mpc = (w.T @ centered_jacobian).squeeze()
|
|
625
625
|
return pd.Series(mpc, index=stakes.index)
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
from typing import Tuple, Optional
|
|
2
|
+
|
|
3
|
+
from numerai_tools.submissions import validate_headers_signals, validate_ids_signals
|
|
4
|
+
from numerai_tools.scoring import (
|
|
5
|
+
filter_sort_index,
|
|
6
|
+
filter_sort_top_bottom,
|
|
7
|
+
spearman_correlation,
|
|
8
|
+
tie_kept_rank,
|
|
9
|
+
tie_kept_rank__gaussianize__pow_1_5,
|
|
10
|
+
filter_sort_index_many,
|
|
11
|
+
generate_neutralized_weights,
|
|
12
|
+
weight_normalize,
|
|
13
|
+
center,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
import pandas as pd
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def churn(
|
|
20
|
+
s1: pd.Series,
|
|
21
|
+
s2: pd.Series,
|
|
22
|
+
top_bottom: Optional[int] = None,
|
|
23
|
+
) -> float:
|
|
24
|
+
"""Calculate the churn between two series. Churn is the proportion of elements
|
|
25
|
+
that are different between the two series.
|
|
26
|
+
|
|
27
|
+
For 2 given series with overlapping indices, churn is 1 - Spearman Correlation.
|
|
28
|
+
If top_bottom is provided, the churn is calculated as the average of the % of
|
|
29
|
+
tickers that stay in the top and bottom predictions. This is only relevant when
|
|
30
|
+
the series are rank signals and not portfolio weights.
|
|
31
|
+
|
|
32
|
+
Arguments:
|
|
33
|
+
s1: pd.Series - the first series to compare
|
|
34
|
+
s2: pd.Series - the second series to compare
|
|
35
|
+
top_bottom: Optional[int] - the number of top and bottom predictions to use
|
|
36
|
+
when calculating the correlation. Results in
|
|
37
|
+
2*top_bottom predictions.
|
|
38
|
+
|
|
39
|
+
Returns:
|
|
40
|
+
float - the churn between the two series
|
|
41
|
+
"""
|
|
42
|
+
if top_bottom is not None and top_bottom > 0:
|
|
43
|
+
s1_top, s1_bot = filter_sort_top_bottom(s1, top_bottom, False)
|
|
44
|
+
s2_top, s2_bot = filter_sort_top_bottom(s2, top_bottom, False)
|
|
45
|
+
top_overlap = len(s1_top.index.intersection(s2_top.index)) / top_bottom
|
|
46
|
+
bot_overlap = len(s1_bot.index.intersection(s2_bot.index)) / top_bottom
|
|
47
|
+
avg_overlap = (top_overlap + bot_overlap) / 2
|
|
48
|
+
return 1 - avg_overlap
|
|
49
|
+
|
|
50
|
+
s1, s2 = filter_sort_index(s1, s2)
|
|
51
|
+
assert s1.std() > 0, "s1 must have non-zero standard deviation"
|
|
52
|
+
assert s2.std() > 0, "s2 must have non-zero standard deviation"
|
|
53
|
+
return 1 - spearman_correlation(s1, s2)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def turnover(
|
|
57
|
+
s1: pd.Series,
|
|
58
|
+
s2: pd.Series,
|
|
59
|
+
):
|
|
60
|
+
"""Calculate the turnover between two series. Turnover is the total change in weights between
|
|
61
|
+
the two series divided by 2.
|
|
62
|
+
|
|
63
|
+
For 2 given series with overlapping indices, join the series on index, fill nans with zeroes
|
|
64
|
+
and calculate turnover as the absolute total difference between the two series divided by 2.
|
|
65
|
+
This is only relevant when the series are portfolio weights and not rank signals.
|
|
66
|
+
|
|
67
|
+
Arguments:
|
|
68
|
+
s1: pd.Series - the first series to compare
|
|
69
|
+
s2: pd.Series - the second series to compare
|
|
70
|
+
top_bottom: Optional[int] - the number of top and bottom predictions to use
|
|
71
|
+
when calculating the correlation. Results in
|
|
72
|
+
2*top_bottom predictions.
|
|
73
|
+
|
|
74
|
+
Returns:
|
|
75
|
+
float - the turnover between the two series
|
|
76
|
+
"""
|
|
77
|
+
s1, s2 = filter_sort_index(s1, s2)
|
|
78
|
+
turnover = (s1 - s2).abs().sum() / 2
|
|
79
|
+
return turnover
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def neutral_weight(
|
|
83
|
+
submission: pd.Series,
|
|
84
|
+
signal_col: str,
|
|
85
|
+
neutralizer: pd.DataFrame,
|
|
86
|
+
weight: pd.Series,
|
|
87
|
+
) -> pd.Series:
|
|
88
|
+
s_prime = tie_kept_rank__gaussianize__pow_1_5(submission.to_frame())[signal_col]
|
|
89
|
+
s_prime, neutralizer, weight = filter_sort_index_many( # type: ignore
|
|
90
|
+
[s_prime, neutralizer, weight]
|
|
91
|
+
)
|
|
92
|
+
neutral_weights = generate_neutralized_weights(s_prime, neutralizer, weight)
|
|
93
|
+
neutral_weights = weight_normalize(center(neutral_weights.to_frame()))[0]
|
|
94
|
+
return neutral_weights.sort_index()
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def remap_ticker_col(
|
|
98
|
+
predictions: pd.DataFrame,
|
|
99
|
+
universe: pd.DataFrame,
|
|
100
|
+
ticker_col: str,
|
|
101
|
+
) -> pd.DataFrame:
|
|
102
|
+
return (
|
|
103
|
+
predictions.join(universe, how="right")
|
|
104
|
+
.reset_index()
|
|
105
|
+
.set_index(ticker_col)
|
|
106
|
+
.sort_index()
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def rank_and_fill_signal(
|
|
111
|
+
universe: pd.DataFrame,
|
|
112
|
+
submission: pd.Series,
|
|
113
|
+
signal_col: str,
|
|
114
|
+
) -> pd.Series:
|
|
115
|
+
uni_joined_sub = universe.sort_index().join(
|
|
116
|
+
tie_kept_rank(submission.sort_index().to_frame())
|
|
117
|
+
)[[signal_col]]
|
|
118
|
+
filled_sub = uni_joined_sub.fillna(uni_joined_sub.median()).sort_index()
|
|
119
|
+
return filled_sub[signal_col]
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def calculate_max_churn_and_turnover(
|
|
123
|
+
curr_sub: pd.DataFrame,
|
|
124
|
+
curr_neutralizer: pd.DataFrame,
|
|
125
|
+
curr_weight: pd.Series,
|
|
126
|
+
prev_week_subs: dict[str, pd.DataFrame],
|
|
127
|
+
prev_neutralizers: dict[str, pd.DataFrame],
|
|
128
|
+
prev_sample_weights: dict[str, pd.Series],
|
|
129
|
+
universe: pd.DataFrame,
|
|
130
|
+
curr_signal_col: str,
|
|
131
|
+
curr_ticker_col: str,
|
|
132
|
+
) -> Tuple[float, float]:
|
|
133
|
+
"""Calculate the maximum churn and turnover with respect to previous submissions.
|
|
134
|
+
|
|
135
|
+
Arguments:
|
|
136
|
+
curr_sub -- the current submission
|
|
137
|
+
curr_neutralizer -- the neutralizer DataFrame for the current submission
|
|
138
|
+
curr_weight -- the sample weights Series for the current submission
|
|
139
|
+
prev_week_subs -- a dictionary of datestamps to submissions
|
|
140
|
+
prev_neutralizers -- a dictionary of datestamps to neutralizers
|
|
141
|
+
prev_sample_weights -- a dictionary of datestamps to sample weights
|
|
142
|
+
universe -- the internal universe DataFrame
|
|
143
|
+
curr_signal_col -- the column name for signal in the current submission
|
|
144
|
+
curr_ticker_col -- the column name for tickers in the current submission
|
|
145
|
+
|
|
146
|
+
Returns:
|
|
147
|
+
prev_week_max_churn -- the maximum churn from previous submissions
|
|
148
|
+
prev_week_max_turnover -- the maximum turnover from previous submissions
|
|
149
|
+
"""
|
|
150
|
+
curr_sub_vector: pd.Series = rank_and_fill_signal(
|
|
151
|
+
universe,
|
|
152
|
+
curr_sub.reset_index().set_index(curr_ticker_col).sort_index()[curr_signal_col],
|
|
153
|
+
curr_signal_col,
|
|
154
|
+
)
|
|
155
|
+
churn_stats = []
|
|
156
|
+
turnover_stats = []
|
|
157
|
+
neutralized_weights = neutral_weight(
|
|
158
|
+
curr_sub_vector, curr_signal_col, curr_neutralizer, curr_weight
|
|
159
|
+
)
|
|
160
|
+
for datestamp in prev_week_subs:
|
|
161
|
+
prev_sub = prev_week_subs[datestamp]
|
|
162
|
+
prev_neutralizer = prev_neutralizers[datestamp]
|
|
163
|
+
prev_weight = prev_sample_weights[datestamp]
|
|
164
|
+
prev_ticker_col, prev_signal_col = validate_headers_signals(prev_sub) # type: ignore
|
|
165
|
+
prev_universe = universe.reset_index().set_index(prev_ticker_col)
|
|
166
|
+
filtered_prev_sub_df, _ = validate_ids_signals(
|
|
167
|
+
prev_universe.index, prev_sub, prev_ticker_col
|
|
168
|
+
)
|
|
169
|
+
# in case the previous submission has a different ticker column,
|
|
170
|
+
# remap the ticker column of prev data to the current ticker column
|
|
171
|
+
filtered_prev_sub = remap_ticker_col(
|
|
172
|
+
filtered_prev_sub_df.set_index(prev_ticker_col),
|
|
173
|
+
universe=prev_universe,
|
|
174
|
+
ticker_col=curr_ticker_col,
|
|
175
|
+
)[curr_signal_col]
|
|
176
|
+
filtered_prev_sub = rank_and_fill_signal(
|
|
177
|
+
universe=universe,
|
|
178
|
+
submission=filtered_prev_sub,
|
|
179
|
+
signal_col=curr_signal_col,
|
|
180
|
+
)
|
|
181
|
+
prev_neutralizer = remap_ticker_col(
|
|
182
|
+
prev_neutralizer,
|
|
183
|
+
universe=prev_universe,
|
|
184
|
+
ticker_col=curr_ticker_col,
|
|
185
|
+
).filter(like="neutralizer_")
|
|
186
|
+
prev_weight = remap_ticker_col(
|
|
187
|
+
prev_weight.to_frame(),
|
|
188
|
+
universe=prev_universe,
|
|
189
|
+
ticker_col=curr_ticker_col,
|
|
190
|
+
)[prev_weight.name]
|
|
191
|
+
prev_neutralized_weights = neutral_weight(
|
|
192
|
+
filtered_prev_sub, prev_signal_col, prev_neutralizer, prev_weight
|
|
193
|
+
)
|
|
194
|
+
try:
|
|
195
|
+
churn_val = abs(churn(curr_sub_vector, filtered_prev_sub))
|
|
196
|
+
except AssertionError as e:
|
|
197
|
+
if "does not have enough overlapping ids" in str(e):
|
|
198
|
+
continue
|
|
199
|
+
try:
|
|
200
|
+
turnover_val = abs(turnover(neutralized_weights, prev_neutralized_weights))
|
|
201
|
+
except AssertionError as e:
|
|
202
|
+
if "does not have enough overlapping ids" in str(e):
|
|
203
|
+
continue
|
|
204
|
+
|
|
205
|
+
churn_stats.append(churn_val)
|
|
206
|
+
turnover_stats.append(turnover_val)
|
|
207
|
+
if len(churn_stats) == 0:
|
|
208
|
+
prev_week_max_churn = 1.0
|
|
209
|
+
else:
|
|
210
|
+
prev_week_max_churn = max(churn_stats)
|
|
211
|
+
if len(turnover_stats) == 0:
|
|
212
|
+
prev_week_max_turnover = 1.0
|
|
213
|
+
else:
|
|
214
|
+
prev_week_max_turnover = max(turnover_stats)
|
|
215
|
+
return prev_week_max_churn, prev_week_max_turnover
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from numerai_tools.scoring import tie_kept_rank
|
|
2
2
|
|
|
3
|
+
import logging
|
|
3
4
|
from typing import Tuple, List
|
|
4
5
|
|
|
5
6
|
import pandas as pd
|
|
@@ -16,12 +17,15 @@ SIGNALS_ALLOWED_ID_COLS = [
|
|
|
16
17
|
"numerai_ticker",
|
|
17
18
|
]
|
|
18
19
|
SIGNALS_ALLOWED_PRED_COLS = ["prediction", "signal"]
|
|
20
|
+
SIGNALS_ALLOWED_DATE_COLS = ["friday_date", "date"]
|
|
19
21
|
SIGNALS_MIN_TICKERS = 100
|
|
20
22
|
|
|
21
23
|
CRYPTO_ALLOWED_ID_COLS = ["symbol"]
|
|
22
24
|
CRYPTO_ALLOWED_PRED_COLS = ["prediction", "signal"]
|
|
23
25
|
CRYPTO_MIN_TICKERS = 100
|
|
24
26
|
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
25
29
|
|
|
26
30
|
def _validate_headers(
|
|
27
31
|
expected_id_cols: List[str], expected_pred_cols: List[str], submission: pd.DataFrame
|
|
@@ -58,6 +62,17 @@ def validate_headers_numerai(submission: pd.DataFrame) -> Tuple[str, str]:
|
|
|
58
62
|
|
|
59
63
|
|
|
60
64
|
def validate_headers_signals(submission: pd.DataFrame) -> Tuple[str, str]:
|
|
65
|
+
if "data_type" in submission.columns:
|
|
66
|
+
logger.warning(
|
|
67
|
+
"data_type column found in Signals submission. This is deprecated and will be removed in the future. "
|
|
68
|
+
"Please remove the data_type column from your Signals submission."
|
|
69
|
+
)
|
|
70
|
+
date_col = [
|
|
71
|
+
date_col
|
|
72
|
+
for date_col in SIGNALS_ALLOWED_DATE_COLS
|
|
73
|
+
if date_col in list(submission.columns)
|
|
74
|
+
]
|
|
75
|
+
submission = submission.drop(columns=["data_type", *date_col], errors="ignore")
|
|
61
76
|
return _validate_headers(
|
|
62
77
|
SIGNALS_ALLOWED_ID_COLS, SIGNALS_ALLOWED_PRED_COLS, submission
|
|
63
78
|
)
|
|
@@ -155,6 +170,7 @@ def clean_predictions(
|
|
|
155
170
|
predictions: pd.DataFrame,
|
|
156
171
|
id_col: str,
|
|
157
172
|
rank_and_fill: bool,
|
|
173
|
+
left_join_on_ids: bool = False,
|
|
158
174
|
) -> pd.Series:
|
|
159
175
|
"""Prepare predictions for submission to Numerai.
|
|
160
176
|
Filters out ids not in live data, drops duplicates, sets ids as index,
|
|
@@ -169,6 +185,7 @@ def clean_predictions(
|
|
|
169
185
|
predictions: pd.DataFrame - the predictions to clean
|
|
170
186
|
id_col: str - the column name of the ids
|
|
171
187
|
rank_and_fill: bool - whether to rank and fill NaNs with 0.5
|
|
188
|
+
left_join_on_ids: bool - whether to left join the predictions onto the ids
|
|
172
189
|
"""
|
|
173
190
|
assert len(live_ids) > 0, "live_ids must not be empty"
|
|
174
191
|
assert live_ids.isna().sum() == 0, "live_ids must not contain NaNs"
|
|
@@ -177,13 +194,15 @@ def clean_predictions(
|
|
|
177
194
|
# drop null indices
|
|
178
195
|
predictions = predictions[~predictions[id_col].isna()]
|
|
179
196
|
predictions = (
|
|
180
|
-
predictions
|
|
181
|
-
|
|
182
|
-
|
|
197
|
+
predictions[
|
|
198
|
+
# filter out ids not in live data
|
|
199
|
+
predictions[id_col].isin(live_ids)
|
|
200
|
+
]
|
|
183
201
|
# drop duplicate ids (keep first)
|
|
184
202
|
.drop_duplicates(subset=id_col, keep="first")
|
|
185
203
|
# set ids as index
|
|
186
|
-
.set_index(id_col)
|
|
204
|
+
.set_index(id_col)
|
|
205
|
+
.sort_index()
|
|
187
206
|
)
|
|
188
207
|
# rank and fill with 0.5
|
|
189
208
|
if rank_and_fill:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: numerai-tools
|
|
3
|
-
Version: 0.4.2.dev1
|
|
3
|
+
Version: 0.5.0.dev0
|
|
4
4
|
Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
|
|
5
5
|
Home-page: https://github.com/numerai/numerai-tools
|
|
6
6
|
Maintainer: Numerai
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd # type: ignore
|
|
5
|
+
|
|
6
|
+
from numerai_tools.signals import (
|
|
7
|
+
churn,
|
|
8
|
+
turnover,
|
|
9
|
+
calculate_max_churn_and_turnover,
|
|
10
|
+
)
|
|
11
|
+
from .util import (
|
|
12
|
+
generate_fake_universe,
|
|
13
|
+
generate_new_submission,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class TestSignals(unittest.TestCase):
|
|
18
|
+
def setUp(self):
|
|
19
|
+
self.up = pd.Series(list(range(5))).rename("up")
|
|
20
|
+
self.down = pd.Series(list(reversed(range(5)))).rename("down")
|
|
21
|
+
self.up_down = pd.Series([0, 1, 2, 1, 0]).rename("up_down")
|
|
22
|
+
self.oscillate = pd.Series([1, 0, 1, 0, 1]).rename("oscillate")
|
|
23
|
+
self.constant = pd.Series([1, 1, 1, 1, 1]).rename("pos_neg")
|
|
24
|
+
|
|
25
|
+
def test_churn(self):
|
|
26
|
+
assert np.isclose(churn(self.up, self.up), 0)
|
|
27
|
+
assert np.isclose(churn(self.up, self.up_down), 1)
|
|
28
|
+
assert np.isclose(churn(self.up, self.oscillate), 1)
|
|
29
|
+
assert np.isclose(churn(self.up, self.down), 2)
|
|
30
|
+
self.assertRaisesRegex(
|
|
31
|
+
AssertionError,
|
|
32
|
+
"s2 must have non-zero standard deviation",
|
|
33
|
+
churn,
|
|
34
|
+
self.up,
|
|
35
|
+
self.constant,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
def test_churn_tb(self):
|
|
39
|
+
tmp = churn(self.up, self.up, top_bottom=2)
|
|
40
|
+
assert np.isclose(tmp, 0), tmp
|
|
41
|
+
tmp = churn(self.up, self.up_down, top_bottom=2)
|
|
42
|
+
assert np.isclose(tmp, 0.5), tmp
|
|
43
|
+
tmp = churn(self.up, self.oscillate, top_bottom=2)
|
|
44
|
+
assert np.isclose(tmp, 0.5), tmp
|
|
45
|
+
tmp = churn(self.up, self.down, top_bottom=2)
|
|
46
|
+
assert np.isclose(tmp, 1), tmp
|
|
47
|
+
tmp = churn(self.up, self.constant, top_bottom=2)
|
|
48
|
+
assert np.isclose(tmp, 0), tmp
|
|
49
|
+
|
|
50
|
+
def test_turnover(self):
|
|
51
|
+
assert np.isclose(turnover(self.up, self.up), 0)
|
|
52
|
+
assert np.isclose(turnover(self.up, self.up_down), 3)
|
|
53
|
+
assert np.isclose(turnover(self.up, self.oscillate), 4.5)
|
|
54
|
+
assert np.isclose(turnover(self.up, self.down), 6)
|
|
55
|
+
assert np.isclose(turnover(self.up, self.constant), 3.5)
|
|
56
|
+
|
|
57
|
+
def test_churn_first_submission(self):
|
|
58
|
+
"""
|
|
59
|
+
Test that the churn function works for the first submission
|
|
60
|
+
No exceptions should be raised, should return 1
|
|
61
|
+
"""
|
|
62
|
+
fake_universe = generate_fake_universe("20130308")
|
|
63
|
+
fake_submission = generate_new_submission(fake_universe)
|
|
64
|
+
fake_neutralizers = pd.DataFrame(
|
|
65
|
+
{
|
|
66
|
+
"neutralizer_1": [0.1] * len(fake_universe),
|
|
67
|
+
"neutralizer_2": [0.2] * len(fake_universe),
|
|
68
|
+
},
|
|
69
|
+
index=fake_universe["numerai_ticker"],
|
|
70
|
+
)
|
|
71
|
+
fake_sample_weights = pd.Series(
|
|
72
|
+
[0.5] * len(fake_universe),
|
|
73
|
+
index=fake_universe["numerai_ticker"],
|
|
74
|
+
name="sample_weight",
|
|
75
|
+
)
|
|
76
|
+
churn, turnover = calculate_max_churn_and_turnover(
|
|
77
|
+
curr_sub=fake_submission,
|
|
78
|
+
curr_neutralizer=fake_neutralizers,
|
|
79
|
+
curr_weight=fake_sample_weights,
|
|
80
|
+
prev_week_subs=[],
|
|
81
|
+
prev_neutralizers={"20240208": fake_neutralizers},
|
|
82
|
+
prev_sample_weights={"20240208": fake_sample_weights},
|
|
83
|
+
universe=fake_universe.set_index("numerai_ticker").sort_index(),
|
|
84
|
+
curr_signal_col="signal",
|
|
85
|
+
curr_ticker_col="numerai_ticker",
|
|
86
|
+
)
|
|
87
|
+
assert np.isclose(churn, 1)
|
|
88
|
+
assert np.isclose(turnover, 1)
|
|
89
|
+
|
|
90
|
+
def test_churn_handles_different_id_columns(self):
|
|
91
|
+
"""
|
|
92
|
+
Test that the churn function works when
|
|
93
|
+
previous submission has different id columns.
|
|
94
|
+
"""
|
|
95
|
+
fake_universe = generate_fake_universe("20130308")
|
|
96
|
+
fake_submission = generate_new_submission(fake_universe, legacy_headers=True)
|
|
97
|
+
new_fake_universe = generate_fake_universe(
|
|
98
|
+
date_value="20130308", ticker_col="ticker"
|
|
99
|
+
)
|
|
100
|
+
fake_universe["ticker"] = new_fake_universe["ticker"]
|
|
101
|
+
prev_submission = fake_submission.copy()
|
|
102
|
+
fake_neutralizers = pd.DataFrame(
|
|
103
|
+
{
|
|
104
|
+
"neutralizer_1": [0.1] * len(fake_universe),
|
|
105
|
+
"neutralizer_2": [0.2] * len(fake_universe),
|
|
106
|
+
},
|
|
107
|
+
index=fake_universe["numerai_ticker"],
|
|
108
|
+
)
|
|
109
|
+
fake_sample_weights = pd.Series(
|
|
110
|
+
[0.5] * len(fake_universe),
|
|
111
|
+
index=fake_universe["numerai_ticker"],
|
|
112
|
+
name="sample_weight",
|
|
113
|
+
)
|
|
114
|
+
# switch out the numerai_ticker col in-place
|
|
115
|
+
prev_submission["numerai_ticker"] = new_fake_universe["ticker"]
|
|
116
|
+
prev_submission.rename(columns={"numerai_ticker": "ticker"}, inplace=True)
|
|
117
|
+
prev_neutralizers = fake_neutralizers.copy()
|
|
118
|
+
prev_neutralizers.index = new_fake_universe["ticker"]
|
|
119
|
+
prev_neutralizers.index.name = "ticker"
|
|
120
|
+
prev_sample_weights = fake_sample_weights.copy()
|
|
121
|
+
prev_sample_weights.index = new_fake_universe["ticker"]
|
|
122
|
+
prev_sample_weights.index.name = "ticker"
|
|
123
|
+
churn, turnover = calculate_max_churn_and_turnover(
|
|
124
|
+
curr_sub=fake_submission,
|
|
125
|
+
curr_neutralizer=fake_neutralizers,
|
|
126
|
+
curr_weight=fake_sample_weights,
|
|
127
|
+
prev_week_subs={"20240208": prev_submission},
|
|
128
|
+
prev_neutralizers={"20240208": prev_neutralizers},
|
|
129
|
+
prev_sample_weights={"20240208": prev_sample_weights},
|
|
130
|
+
universe=fake_universe.set_index("numerai_ticker").sort_index(),
|
|
131
|
+
curr_signal_col="signal",
|
|
132
|
+
curr_ticker_col="numerai_ticker",
|
|
133
|
+
)
|
|
134
|
+
assert np.isclose(churn, 0)
|
|
135
|
+
assert np.isclose(turnover, 0)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
if __name__ == "__main__":
|
|
139
|
+
unittest.main()
|
|
@@ -155,6 +155,18 @@ class TestSubmissions(unittest.TestCase):
|
|
|
155
155
|
sub[[sub.columns[1]]],
|
|
156
156
|
)
|
|
157
157
|
|
|
158
|
+
def test_validate_headers_signals_data_type_and_date_col(self):
|
|
159
|
+
fake_sub = generate_submission(self.ids, "ticker", "signal")
|
|
160
|
+
fake_sub["data_type"] = "signals"
|
|
161
|
+
fake_sub["friday_date"] = "2023-01-01"
|
|
162
|
+
with self.assertLogs(level="WARNING") as cm:
|
|
163
|
+
assert validate_headers_signals(fake_sub) == ("ticker", "signal")
|
|
164
|
+
self.assertIn(
|
|
165
|
+
"WARNING:numerai_tools.submissions:data_type column found in Signals submission. This is deprecated and will be removed in the future. "
|
|
166
|
+
"Please remove the data_type column from your Signals submission.",
|
|
167
|
+
cm.output[0],
|
|
168
|
+
)
|
|
169
|
+
|
|
158
170
|
def test_validate_headers_crypto(self):
|
|
159
171
|
for sub in self.crypto_subs:
|
|
160
172
|
assert validate_headers_crypto(sub) == tuple(sub.columns)
|
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
from numerai_tools.scoring import (
|
|
2
|
-
filter_sort_index,
|
|
3
|
-
filter_sort_top_bottom,
|
|
4
|
-
spearman_correlation,
|
|
5
|
-
)
|
|
6
|
-
|
|
7
|
-
from typing import List, Tuple, Union, Optional
|
|
8
|
-
|
|
9
|
-
import pandas as pd
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
def churn(
|
|
13
|
-
s1: pd.Series,
|
|
14
|
-
s2: pd.Series,
|
|
15
|
-
top_bottom: Optional[int] = None,
|
|
16
|
-
) -> float:
|
|
17
|
-
"""Calculate the churn between two series. Churn is the proportion of elements
|
|
18
|
-
that are different between the two series.
|
|
19
|
-
|
|
20
|
-
For 2 given series with overlapping indices, churn is 1 - Spearman Correlation.
|
|
21
|
-
If top_bottom is provided, the churn is calculated as the average of the % of
|
|
22
|
-
tickers that stay in the top and bottom predictions. This is only relevant when
|
|
23
|
-
the series are rank signals and not portfolio weights.
|
|
24
|
-
|
|
25
|
-
Arguments:
|
|
26
|
-
s1: pd.Series - the first series to compare
|
|
27
|
-
s2: pd.Series - the second series to compare
|
|
28
|
-
top_bottom: Optional[int] - the number of top and bottom predictions to use
|
|
29
|
-
when calculating the correlation. Results in
|
|
30
|
-
2*top_bottom predictions.
|
|
31
|
-
|
|
32
|
-
Returns:
|
|
33
|
-
float - the churn between the two series
|
|
34
|
-
"""
|
|
35
|
-
if top_bottom is not None and top_bottom > 0:
|
|
36
|
-
s1_top, s1_bot = filter_sort_top_bottom(s1, top_bottom, False)
|
|
37
|
-
s2_top, s2_bot = filter_sort_top_bottom(s2, top_bottom, False)
|
|
38
|
-
top_overlap = len(s1_top.index.intersection(s2_top.index)) / top_bottom
|
|
39
|
-
bot_overlap = len(s1_bot.index.intersection(s2_bot.index)) / top_bottom
|
|
40
|
-
avg_overlap = (top_overlap + bot_overlap) / 2
|
|
41
|
-
return 1 - avg_overlap
|
|
42
|
-
|
|
43
|
-
s1, s2 = filter_sort_index(s1, s2)
|
|
44
|
-
assert s1.std() > 0, "s1 must have non-zero standard deviation"
|
|
45
|
-
assert s2.std() > 0, "s2 must have non-zero standard deviation"
|
|
46
|
-
return 1 - spearman_correlation(s1, s2)
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
def turnover(
|
|
50
|
-
s1: pd.Series,
|
|
51
|
-
s2: pd.Series,
|
|
52
|
-
):
|
|
53
|
-
"""Calculate the turnover between two series. Turnover is the total change in weights between
|
|
54
|
-
the two series divided by 2.
|
|
55
|
-
|
|
56
|
-
For 2 given series with overlapping indices, join the series on index, fill nans with zeroes
|
|
57
|
-
and calculate turnover as the absolute total difference between the two series divided by 2.
|
|
58
|
-
This is only relevant when the series are portfolio weights and not rank signals.
|
|
59
|
-
|
|
60
|
-
Arguments:
|
|
61
|
-
s1: pd.Series - the first series to compare
|
|
62
|
-
s2: pd.Series - the second series to compare
|
|
63
|
-
top_bottom: Optional[int] - the number of top and bottom predictions to use
|
|
64
|
-
when calculating the correlation. Results in
|
|
65
|
-
2*top_bottom predictions.
|
|
66
|
-
|
|
67
|
-
Returns:
|
|
68
|
-
float - the turnover between the two series
|
|
69
|
-
"""
|
|
70
|
-
s1, s2 = filter_sort_index(s1, s2)
|
|
71
|
-
turnover = (s1 - s2).abs().sum() / 2
|
|
72
|
-
return turnover
|
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
import unittest
|
|
2
|
-
|
|
3
|
-
import numpy as np
|
|
4
|
-
import pandas as pd # type: ignore
|
|
5
|
-
|
|
6
|
-
from numerai_tools.signals import churn, turnover
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class TestSignals(unittest.TestCase):
|
|
10
|
-
def setUp(self):
|
|
11
|
-
self.up = pd.Series(list(range(5))).rename("up")
|
|
12
|
-
self.down = pd.Series(list(reversed(range(5)))).rename("down")
|
|
13
|
-
self.up_down = pd.Series([0, 1, 2, 1, 0]).rename("up_down")
|
|
14
|
-
self.oscillate = pd.Series([1, 0, 1, 0, 1]).rename("oscillate")
|
|
15
|
-
self.constant = pd.Series([1, 1, 1, 1, 1]).rename("pos_neg")
|
|
16
|
-
|
|
17
|
-
def test_churn(self):
|
|
18
|
-
assert np.isclose(churn(self.up, self.up), 0)
|
|
19
|
-
assert np.isclose(churn(self.up, self.up_down), 1)
|
|
20
|
-
assert np.isclose(churn(self.up, self.oscillate), 1)
|
|
21
|
-
assert np.isclose(churn(self.up, self.down), 2)
|
|
22
|
-
self.assertRaisesRegex(
|
|
23
|
-
AssertionError,
|
|
24
|
-
"s2 must have non-zero standard deviation",
|
|
25
|
-
churn,
|
|
26
|
-
self.up,
|
|
27
|
-
self.constant,
|
|
28
|
-
)
|
|
29
|
-
|
|
30
|
-
def test_churn_tb(self):
|
|
31
|
-
tmp = churn(self.up, self.up, top_bottom=2)
|
|
32
|
-
assert np.isclose(tmp, 0), tmp
|
|
33
|
-
tmp = churn(self.up, self.up_down, top_bottom=2)
|
|
34
|
-
assert np.isclose(tmp, 0.5), tmp
|
|
35
|
-
tmp = churn(self.up, self.oscillate, top_bottom=2)
|
|
36
|
-
assert np.isclose(tmp, 0.5), tmp
|
|
37
|
-
tmp = churn(self.up, self.down, top_bottom=2)
|
|
38
|
-
assert np.isclose(tmp, 1), tmp
|
|
39
|
-
tmp = churn(self.up, self.constant, top_bottom=2)
|
|
40
|
-
assert np.isclose(tmp, 0), tmp
|
|
41
|
-
|
|
42
|
-
def test_turnover(self):
|
|
43
|
-
assert np.isclose(turnover(self.up, self.up), 0)
|
|
44
|
-
assert np.isclose(turnover(self.up, self.up_down), 3)
|
|
45
|
-
assert np.isclose(turnover(self.up, self.oscillate), 4.5)
|
|
46
|
-
assert np.isclose(turnover(self.up, self.down), 6)
|
|
47
|
-
assert np.isclose(turnover(self.up, self.constant), 3.5)
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
if __name__ == "__main__":
|
|
51
|
-
unittest.main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0.dev0}/numerai_tools.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|