numerai-tools 0.5.0.dev13__tar.gz → 0.6.0.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {numerai_tools-0.5.0.dev13 → numerai_tools-0.6.0.dev0}/PKG-INFO +1 -1
- numerai_tools-0.6.0.dev0/numerai_tools/data.py +133 -0
- numerai_tools-0.6.0.dev0/numerai_tools/indexing.py +106 -0
- numerai_tools-0.6.0.dev0/numerai_tools/math.py +263 -0
- numerai_tools-0.6.0.dev0/numerai_tools/scoring.py +290 -0
- {numerai_tools-0.5.0.dev13 → numerai_tools-0.6.0.dev0}/numerai_tools/signals.py +43 -21
- numerai_tools-0.6.0.dev0/numerai_tools/typing.py +6 -0
- {numerai_tools-0.5.0.dev13 → numerai_tools-0.6.0.dev0}/pyproject.toml +1 -1
- numerai_tools-0.5.0.dev13/numerai_tools/scoring.py +0 -644
- {numerai_tools-0.5.0.dev13 → numerai_tools-0.6.0.dev0}/LICENSE +0 -0
- {numerai_tools-0.5.0.dev13 → numerai_tools-0.6.0.dev0}/README.md +0 -0
- {numerai_tools-0.5.0.dev13 → numerai_tools-0.6.0.dev0}/numerai_tools/__init__.py +0 -0
- {numerai_tools-0.5.0.dev13 → numerai_tools-0.6.0.dev0}/numerai_tools/py.typed +0 -0
- {numerai_tools-0.5.0.dev13 → numerai_tools-0.6.0.dev0}/numerai_tools/submissions.py +0 -0
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
from typing import List, Union, Optional
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import numpy as np
|
|
5
|
+
from sklearn.preprocessing import OneHotEncoder # type: ignore
|
|
6
|
+
|
|
7
|
+
from numerai_tools.scoring import tie_kept_rank
|
|
8
|
+
|
|
9
|
+
DEFAULT_BINS = (0.0, 0.25, 0.5, 0.75, 1.0)
|
|
10
|
+
DEFAULT_QUANTILES = (0.05, 0.25, 0.75, 0.95)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def one_hot_encode(
    df: pd.DataFrame, columns: List[str], dtype: type = np.float64
) -> pd.DataFrame:
    """Replace the given columns of a dataframe with their one-hot encodings.

    Every listed column is expected to hold discrete values (categories,
    bucket values, etc.). A column with k distinct values is swapped for k
    indicator columns, each holding 1 on the rows that carry the
    corresponding value and 0 on all other rows.

    Arguments:
        df: pd.DataFrame - the data containing the columns to encode
        columns: List[str] - names of the columns to replace with indicators
        dtype: type = np.float64 - datatype of the generated indicator columns

    Returns:
        pd.DataFrame - the input data with each listed column replaced by its
            indicator columns (named via the encoder's get_feature_names_out)
    """
    encoded = df
    for name in columns:
        # fit a fresh encoder on this single column and densify its output
        enc = OneHotEncoder(dtype=dtype)
        dense = enc.fit_transform(encoded[[name]]).toarray()
        indicators = pd.DataFrame(
            dense,
            index=encoded.index,
            columns=enc.get_feature_names_out(),
        )
        # attach the indicators, then drop the column they replace
        encoded = encoded.join(indicators).drop(columns=name)
    return encoded
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def balanced_rank_transform(
    df: pd.DataFrame,
    cols: List[str],
    rank_group: Optional[str] = None,
    rank_filter: Optional[str] = None,
) -> pd.DataFrame:
    """
    Tie-kept rank the given columns, optionally per group and after filtering.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame containing the data to be ranked.
    cols : list of str
        Names of the columns to apply the rank transformation to.
    rank_group : str, optional
        Column name to group by before ranking; each group is ranked on its
        own. If None, all remaining rows are ranked together.
    rank_filter : str, optional
        Column name used to select rows before ranking. Only rows selected by
        this column are kept and ranked. If None, no filtering is applied.

    Returns
    -------
    pd.DataFrame
        The ranked ``cols`` for the rows that were kept by the filter.
    """
    selected = df if rank_filter is None else df.loc[df[rank_filter]]
    if rank_group is None:
        ranked = tie_kept_rank(selected[cols])
    else:
        grouped = selected.groupby(rank_group, group_keys=False)
        ranked = grouped.apply(lambda d: tie_kept_rank(d[cols]))
    return ranked[cols]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def quantile_bin(
    data: Union[pd.Series, pd.DataFrame],
    bins: tuple[float, ...] = DEFAULT_BINS,
    quantiles: tuple[float, ...] = DEFAULT_QUANTILES,
) -> pd.DataFrame:
    """
    Bin a Series or DataFrame into discrete quantile-based bins.

    Each column is binned independently. With thresholds ``q_0 .. q_{k-1}``
    computed from the column's own quantiles, a value ``x`` is mapped to:

    * ``bins[0]``  if ``x <= q_0``
    * ``bins[i]``  if ``q_{i-1} < x <= q_i``
    * ``bins[-1]`` if ``x >= q_{k-1}`` (applied last, so a value exactly equal
      to the top threshold lands in the top bin)

    Columns with at most one distinct value are assigned entirely to the
    lowest bin, ``bins[0]``. NaN values match no comparison and stay NaN in
    columns that have more than one distinct value.

    Parameters
    ----------
    data : pd.Series or pd.DataFrame
        Data to bin. A Series is converted to a one-column frame named
        ``"value"``.
    bins : tuple of float
        Values to assign to each bin, lowest bin first.
    quantiles : tuple of float
        Quantile thresholds to use for binning (len = number of bins - 1).

    Returns
    -------
    pd.DataFrame
        Binned values with the same index and columns as the (frame form of
        the) input.

    Raises
    ------
    AssertionError
        If ``bins`` or ``quantiles`` is empty, or if ``quantiles`` is not
        exactly one element shorter than ``bins``.
    """
    assert len(bins), "Invalid bins! Must not be empty."
    assert len(quantiles), "Invalid quantiles! Must not be empty."
    assert len(quantiles) == (
        len(bins) - 1
    ), "Invalid quantiles! Length must be 1 less than bins."

    if isinstance(data, pd.Series):
        data = data.to_frame(name="value")

    binned = data.copy()
    for col in binned.columns:
        values = binned[col].astype(float)

        # handle all-identical values: everything goes to the lowest bin
        if values.nunique() <= 1:
            binned[col] = float(bins[0])
            continue

        # positional thresholds, one per requested quantile
        thresholds = values.quantile(list(quantiles)).to_numpy()

        # Build every mask from the ORIGINAL values and write into a separate
        # series; assigning in place would let already-binned rows be
        # compared against later thresholds and get re-binned.
        result = values.copy()
        result.loc[values <= thresholds[0]] = bins[0]
        for i in range(1, len(bins) - 1):
            in_bin = (values > thresholds[i - 1]) & (values <= thresholds[i])
            result.loc[in_bin] = bins[i]
        result.loc[values >= thresholds[-1]] = bins[-1]

        binned[col] = result.astype(float)

    return binned
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
from typing import List, Tuple, cast, Any
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
# leaving this here for backwards compatibility
|
|
7
|
+
from numerai_tools.typing import S1, S2
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# sometimes when we match up the target/prediction indices,
|
|
11
|
+
# changes in the stock universe cause some stocks to enter / leave,
|
|
12
|
+
# this ensures we don't filter too much
|
|
13
|
+
DEFAULT_MAX_FILTERED_INDEX_RATIO = 0.2
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def filter_sort_index(
    s1: S1, s2: S2, max_filtered_ratio: float = DEFAULT_MAX_FILTERED_INDEX_RATIO
) -> Tuple[S1, S2]:
    """Align two datasets on their shared ids and sort both by index.

    Rows containing NaNs are dropped from each input before the shared ids
    are computed. Each input must keep at least (1 - max_filtered_ratio) of
    its original rows, otherwise an assertion fails.

    Arguments:
        s1: Union[pd.DataFrame, pd.Series] - the first dataset to filter and sort
        s2: Union[pd.DataFrame, pd.Series] - the second dataset to filter and sort
        max_filtered_ratio: float - the largest fraction of either input's
            rows that may be filtered out

    Returns:
        Tuple[
            Union[pd.DataFrame, pd.Series],
            Union[pd.DataFrame, pd.Series],
        ] - the filtered and sorted datasets
    """
    shared_ids = s1.dropna().index.intersection(s2.dropna().index)
    kept = len(shared_ids)
    min_kept_ratio = 1 - max_filtered_ratio
    # ensure we didn't filter too many ids
    assert kept / len(s1) >= min_kept_ratio, (
        "s1 does not have enough overlapping ids with s2,"
        f" must have >= {round(1-max_filtered_ratio,2)*100}% overlapping ids"
    )
    assert kept / len(s2) >= min_kept_ratio, (
        "s2 does not have enough overlapping ids with s1,"
        f" must have >= {round(1-max_filtered_ratio,2)*100}% overlapping ids"
    )
    aligned_first = s1.loc[shared_ids].sort_index()
    aligned_second = s2.loc[shared_ids].sort_index()
    return cast(S1, aligned_first), cast(S2, aligned_second)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def filter_sort_index_many(
    inputs: List[Any],
    max_filtered_ratio: float = DEFAULT_MAX_FILTERED_INDEX_RATIO,
) -> List[Any]:
    """Align any number of datasets on their shared ids and sort each by index.

    Rows containing NaNs are dropped from every input before the shared ids
    are computed. Each input must keep at least (1 - max_filtered_ratio) of
    its original rows, otherwise an assertion fails.

    Arguments:
        inputs: List[Union[pd.DataFrame, pd.Series]] - the list of datasets to filter and sort
        max_filtered_ratio: float - the largest fraction of any input's rows
            that may be filtered out

    Returns:
        List[Union[pd.DataFrame, pd.Series]] - the filtered and sorted datasets
    """
    assert len(inputs) > 0, "List must contain at least one element"
    common_ids = inputs[0].dropna().index
    for other in inputs[1:]:
        common_ids = common_ids.intersection(other.dropna().index)
    aligned = [item.loc[common_ids].sort_index() for item in inputs]
    # ensure we didn't filter too many ids
    for i, (after, before) in enumerate(zip(aligned, inputs)):
        assert len(after) / len(before) >= (1 - max_filtered_ratio), (
            f"inputs[{i}] does not have enough overlapping ids with the others,"
            f" must have >= {round(1-max_filtered_ratio,2)*100}% overlapping ids"
        )
    return aligned
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def filter_sort_top_bottom(
    s: pd.Series, top_bottom: int
) -> Tuple[pd.Series, pd.Series]:
    """Filters the series according to the top n and bottom n values
    then sorts the index and returns two filtered and sorted series
    for the top and bottom values respectively.

    Values are ordered with a stable argsort of the underlying array, so
    ties keep their original relative order. ``top_bottom`` is clamped to
    the range [0, len(s)]; in particular ``top_bottom == 0`` yields two
    empty series.

    Arguments:
        s: pd.Series - the data to filter and sort
        top_bottom: int - the number of top n and bottom n values to keep

    Returns:
        Tuple[pd.Series, pd.Series] - the filtered and sorted top and bottom series respectively
    """
    # argsort the raw values so we hold a plain ndarray of positions rather
    # than a pandas Series; NOTE(review): NaNs then follow numpy's ordering
    # (placed at the end) - confirm callers never pass NaNs here
    order = np.argsort(s.to_numpy(), kind="stable")
    total = len(order)
    n = min(max(top_bottom, 0), total)
    bot = s.iloc[order[:n]]
    # slice from an explicit start: `order[-n:]` would return everything
    # when n == 0
    top = s.iloc[order[total - n :]]
    return top.sort_index(), bot.sort_index()
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def filter_sort_top_bottom_concat(s: pd.Series, top_bottom: int) -> pd.Series:
    """Keep the top n and bottom n values of a series as one index-sorted series.

    Works like filter_sort_top_bottom, except the two resulting series are
    concatenated into a single series whose index is then sorted.

    Arguments:
        s: pd.Series - the data to filter and sort
        top_bottom: int - the number of top n and bottom n values to keep

    Returns:
        pd.Series - the concatenated and sorted series of top and bottom values
    """
    highest, lowest = filter_sort_top_bottom(s, top_bottom)
    combined = pd.concat([highest, lowest])
    return combined.sort_index()
|
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
from typing import Optional, cast, Literal
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from scipy import stats
|
|
6
|
+
|
|
7
|
+
# leaving this here for backwards compatibility
|
|
8
|
+
from numerai_tools.typing import S1
|
|
9
|
+
|
|
10
|
+
from numerai_tools.indexing import (
|
|
11
|
+
filter_sort_index,
|
|
12
|
+
filter_sort_top_bottom_concat,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
RANK_METHOD_TYPE = Literal["average", "min", "max", "first", "dense"]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def rank_series(s: pd.Series, method: RANK_METHOD_TYPE = "average") -> pd.Series:
    """Percentile rank a pandas Series, centering values around 0.5.

    The result is (rank - 0.5) divided by the number of non-null entries
    of the series (at least 1).

    Arguments:
        s: pd.Series - the data to rank; its index must already be sorted
        method: str - the pandas ranking method to use, options include:
            'average' (default) - keeps ties
            'first' - breaks ties by index

    Returns:
        pd.Series - the ranked Series
    """
    assert np.array_equal(s.index.sort_values(), s.index), "unsorted index found"
    raw_ranks = s.rank(method=method)
    # Ensure denominator is at least 1 to avoid division by zero
    n_valid = int(s.count())
    if n_valid < 1:
        n_valid = 1
    return (raw_ranks - 0.5) / n_valid
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def rank(s: S1, method: RANK_METHOD_TYPE = "average") -> S1:
    """Percentile rank each column or series, centering values around 0.5

    Arguments:
        s: pd.DataFrame | pd.Series - the data to rank
        method: str - the pandas ranking method to use, options include:
            'average' (default) - keeps ties
            'first' - breaks ties by index

    Returns:
        pd.DataFrame | pd.Series - the ranked input data
    """
    if not isinstance(s, pd.Series):
        # not a Series: rank every column independently
        return s.apply(lambda column: rank(column, method=method))
    return cast(S1, rank_series(s, method))
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def tie_broken_rank(df: pd.DataFrame) -> pd.DataFrame:
    """Percentile rank every column using the 'first' method (ties broken by order)."""
    return rank(df, method="first")
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def tie_kept_rank(s: S1) -> S1:
    """Percentile rank the input using the 'average' method, so ties are kept."""
    ranked = rank(s, method="average")
    return cast(S1, ranked)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def min_max_normalize(s: pd.Series) -> pd.Series:
    """Linearly rescale a series so its minimum maps to 0 and its maximum to 1.

    Note that a series whose minimum equals its maximum has a zero span, so
    the division below is a division by zero.
    """
    lowest = s.min()
    span = s.max() - lowest
    return (s - lowest) / span
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def variance_normalize(df: pd.DataFrame) -> pd.DataFrame:
    """Divide every column by its standard deviation (np.std) so each has std == 1."""
    column_std = np.std(df, axis=0)
    return df / column_std
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def weight_normalize(s: S1) -> S1:
    """Scale the input so the absolute values of every column sum to 1."""
    abs_totals = s.abs().sum(axis=0)
    return cast(S1, s / abs_totals)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def center(s: S1) -> S1:
    """Subtract the mean from the input so that all columns have mean == 0."""
    shifted = s - s.mean()
    return cast(S1, shifted)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def standardize(df: pd.DataFrame) -> pd.DataFrame:
    """Center then variance-normalize a df so columns have mean == 0 and std == 1."""
    centered = center(df)
    return variance_normalize(centered)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def validate_indices(live_targets: pd.Series, predictions: pd.Series) -> None:
    """Assert that targets and predictions share one sorted index and hold no NaNs.

    Arguments:
        live_targets: pd.Series - the target values
        predictions: pd.Series - the predicted values

    Raises:
        AssertionError - if the indices differ, either index is unsorted,
            or either series contains NaNs
    """
    target_ids = live_targets.index
    prediction_ids = predictions.index
    sorted_target_ids = target_ids.sort_values()
    # ensure the ids are equivalent and sorted
    assert np.array_equal(prediction_ids, sorted_target_ids)
    assert np.array_equal(target_ids, sorted_target_ids)
    assert np.array_equal(prediction_ids, prediction_ids.sort_values())
    # ensure no nans
    assert not predictions.isna().any()
    assert not live_targets.isna().any()
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def correlation(live_targets: pd.Series, predictions: pd.Series) -> float:
    """Return the correlation coefficient (np.corrcoef) of targets and predictions.

    Both series are first checked with validate_indices.
    """
    validate_indices(live_targets, predictions)
    # the off-diagonal entry of the 2x2 matrix is the coefficient we want
    coefficients = np.corrcoef(live_targets, predictions)
    return coefficients[0, 1]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def tie_broken_rank_correlation(target: pd.Series, predictions: pd.Series) -> float:
    """Correlate the target with the tie-broken percentile rank of the predictions.

    Arguments:
        target: pd.Series - the target values
        predictions: pd.Series - the predictions to rank (may be unnamed)

    Returns:
        float - the correlation between the target and the ranked predictions
    """
    # percentile rank the predictions and get the correlation with the target;
    # select the single ranked column by position, because an unnamed Series
    # becomes a frame column labelled 0 and looking it up by its name (None)
    # would raise a KeyError
    ranked_predictions = tie_broken_rank(predictions.to_frame()).iloc[:, 0]
    return correlation(target, ranked_predictions)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def spearman_correlation(target: pd.Series, predictions: pd.Series) -> float:
    """Return the Spearman correlation of target and predictions.

    Both series are first checked with validate_indices.
    """
    validate_indices(target, predictions)
    coefficient = target.corr(predictions, method="spearman")
    return coefficient
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def pearson_correlation(
    target: pd.Series, predictions: pd.Series, top_bottom: Optional[int] = None
) -> float:
    """Return the Pearson correlation of target and predictions.

    Arguments:
        target: pd.Series - the target values
        predictions: pd.Series - the predicted values
        top_bottom: Optional[int] - when a positive number is given, only the
            top n and bottom n predictions (and the matching targets) are used

    Returns:
        float - the Pearson correlation coefficient
    """
    if top_bottom is not None and top_bottom > 0:
        predictions = filter_sort_top_bottom_concat(predictions, top_bottom)
        # most targets are dropped on purpose here, so widen the allowed
        # filtered ratio accordingly
        allowed_filtered = 1 - top_bottom / len(target)
        target, predictions = filter_sort_index(
            target, predictions, allowed_filtered
        )
    validate_indices(target, predictions)
    return target.corr(predictions, method="pearson")
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def sharpe_ratio(s: pd.Series) -> float:
    """Return the mean of the series divided by its standard deviation (np.std)."""
    average = np.mean(s)
    deviation = np.std(s)
    return average / deviation
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def gaussian(df: pd.DataFrame) -> pd.DataFrame:
    """Gaussianize each column of a pandas DataFrame.

    Every column is passed through the percent point function (inverse CDF)
    of the standard normal distribution.

    Arguments:
        df: pd.DataFrame - the data to gaussianize; its index must be sorted

    Returns:
        pd.DataFrame - the gaussianized data
    """
    assert np.array_equal(df.index.sort_values(), df.index)

    def to_normal_quantiles(column: pd.Series) -> np.ndarray:
        return cast(np.ndarray, stats.norm.ppf(column))

    return df.apply(to_normal_quantiles)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def power(df: pd.DataFrame, p: float) -> pd.DataFrame:
    """Raise the magnitude of every value to the given power, keeping its sign.

    Arguments:
        df: pd.DataFrame - the data to raise to the given power;
            must contain no NaNs and have a sorted index
        p: float - the power to which we exponentiate the data

    Returns:
        pd.DataFrame - the data raised to the given power; every resulting
            column is asserted to be either constant or at least 90%
            correlated with the original column
    """
    assert not df.isna().any().any(), "Data contains NaNs"
    assert np.array_equal(df.index.sort_values(), df.index), "Index is not sorted"
    magnitudes = np.abs(df) ** p
    result = cast(pd.DataFrame, np.sign(df) * magnitudes)
    is_constant = result.std() == 0
    is_correlated = result.corrwith(df) >= 0.9
    assert (is_constant | is_correlated).all()
    return result
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def tie_kept_rank__gaussianize__pow_1_5(df: pd.DataFrame) -> pd.DataFrame:
    """Apply three transforms in order to the given pandas DataFrame.
    1. tie-kept rank each column
    2. gaussianize each column
    3. raise each column to the 1.5 power

    Arguments:
        df: pd.DataFrame - the data to transform

    Returns:
        pd.DataFrame - the resulting data after applying the 3 functions
    """
    ranked = tie_kept_rank(df)
    gaussianized = gaussian(ranked)
    return power(gaussianized, 1.5)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def tie_kept_rank__gaussianize__neutralize__variance_normalize(
    df: pd.DataFrame, neutralizers: pd.DataFrame
) -> pd.DataFrame:
    """Apply four transforms in order to the given pandas DataFrame.
    1. tie-kept rank each column
    2. gaussianize each column
    3. neutralize each column to the neutralizers
    4. variance normalize each column

    Arguments:
        df: pd.DataFrame - the data to transform
        neutralizers: pd.DataFrame - the data to neutralize each column against

    Returns:
        pd.DataFrame - the resulting data after applying the 4 functions
    """
    ranked = tie_kept_rank(df)
    gaussianized = gaussian(ranked)
    neutralized = neutralize(gaussianized, neutralizers)
    return variance_normalize(neutralized)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def orthogonalize(v: np.ndarray, u: np.ndarray) -> np.ndarray:
    """Orthogonalizes v with respect to u by projecting v onto u,
    then subtracting that projection from v.

    This will reach the same result as the neutralize
    function when v and u are centered single column vectors,
    but this is much faster.

    Arguments:
        v: np.ndarray - the vector (or matrix of column vectors) to orthogonalize
        u: np.ndarray - the vector to orthogonalize v against

    Returns:
        np.ndarray - the orthogonalized v, with the same shape as v
    """
    coefficients = (v.T @ u) / (u.T @ u)
    # np.outer always returns a 2-D array; reshape it to v's shape so a 1-D v
    # is not broadcast against an (n, 1) projection into an (n, n) matrix.
    # For 2-D v the reshape is a no-op.
    projection = np.outer(u, coefficients).reshape(v.shape)
    return v - projection
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def stake_weight(
    predictions: pd.DataFrame,
    stakes: pd.Series,
) -> pd.Series:
    """Build a stake-weighted meta model from the given predictions and stakes.

    Only the prediction columns named in the stakes index are used; each is
    multiplied by its stake, the products are summed per row, and the sums
    are divided by the total stake.

    Arguments:
        predictions: pd.DataFrame - the predictions to weight
        stakes: pd.Series - the stakes to use as weights

    Returns:
        pd.Series - the stake-weighted meta model
    """
    staked_predictions = predictions[stakes.index]
    weighted_sum = (staked_predictions * stakes).sum(axis=1)
    return weighted_sum / stakes.sum()
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def neutralize(
    df: pd.DataFrame,
    neutralizers: pd.DataFrame,
    proportion: float = 1.0,
) -> pd.DataFrame:
    """Neutralize each column of a given DataFrame by each feature in a given
    neutralizers DataFrame. Neutralization uses least-squares regression to
    find the orthogonal projection of each column onto the neutralizers, then
    subtracts the result from the original predictions.

    Columns of df with zero standard deviation are replaced by NaN before the
    regression. The caller's DataFrame is never modified.

    Arguments:
        df: pd.DataFrame - the data with columns to neutralize
        neutralizers: pd.DataFrame - the neutralizer data with features as columns
        proportion: float - the degree to which neutralization occurs

    Returns:
        pd.DataFrame - the neutralized data

    Raises:
        AssertionError - if either input contains NaNs or the indices differ
    """
    assert not df.isna().any().any(), "Data contains NaNs"
    assert not neutralizers.isna().any().any(), "Neutralizers contain NaNs"
    assert len(df.index) == len(neutralizers.index), "Indices don't match"
    assert (df.index == neutralizers.index).all(), "Indices don't match"
    constant_cols = df.columns[df.std() == 0]
    if len(constant_cols) > 0:
        # blank out constant columns on a copy so the caller's data is untouched
        df = df.copy()
        df[constant_cols] = np.nan
    df_arr = df.values
    # add a column of 1s to the neutralizer array in case neutralizer_arr is
    # a single column; it acts as the intercept term of the regression
    neutralizer_arr = np.hstack(
        (neutralizers.values, np.ones((len(neutralizers), 1)))
    )
    least_squares = np.linalg.lstsq(neutralizer_arr, df_arr, rcond=1e-6)[0]
    adjustments = proportion * neutralizer_arr.dot(least_squares)
    neutral = df_arr - adjustments
    return pd.DataFrame(neutral, index=df.index, columns=df.columns)
|