numerai-tools 0.5.0.dev13__tar.gz → 0.6.0.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: numerai-tools
3
- Version: 0.5.0.dev13
3
+ Version: 0.6.0.dev0
4
4
  Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
5
5
  License: MIT
6
6
  Author: Numerai Engineering
@@ -0,0 +1,133 @@
1
+ from typing import List, Union, Optional
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ from sklearn.preprocessing import OneHotEncoder # type: ignore
6
+
7
+ from numerai_tools.scoring import tie_kept_rank
8
+
9
+ DEFAULT_BINS = (0.0, 0.25, 0.5, 0.75, 1.0)
10
+ DEFAULT_QUANTILES = (0.05, 0.25, 0.75, 0.95)
11
+
12
+
13
def one_hot_encode(
    df: pd.DataFrame, columns: List[str], dtype: type = np.float64
) -> pd.DataFrame:
    """Replace the given columns of a dataframe with one-hot encoded columns.

    Each listed column is fitted with its own scikit-learn ``OneHotEncoder``;
    the encoder output is joined onto the frame (on the frame's index) and the
    source column is then dropped. The new columns are named by the encoder's
    ``get_feature_names_out()``.

    Arguments:
        df: pd.DataFrame - the data with columns to one-hot encode
        columns: List[str] - names of the columns to replace w/ their encoding
        dtype: type = np.float64 - dtype handed to the encoder for its output

    Returns:
        pd.DataFrame - the data with the listed columns replaced by their
                       one-hot encodings (the input frame is not modified)
    """
    encoded = df
    for name in columns:
        enc = OneHotEncoder(dtype=dtype)
        # fit_transform returns a sparse matrix, densified below
        sparse_matrix = enc.fit_transform(encoded[[name]])
        indicator_frame = pd.DataFrame(
            sparse_matrix.toarray(),
            index=encoded.index,
            columns=enc.get_feature_names_out(),
        )
        encoded = encoded.join(indicator_frame).drop(columns=name)
    return encoded
39
+
40
+
41
def balanced_rank_transform(
    df: pd.DataFrame,
    cols: List[str],
    rank_group: Optional[str] = None,
    rank_filter: Optional[str] = None,
) -> pd.DataFrame:
    """
    Tie-kept rank the given columns, optionally per group and after filtering.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame containing the data to be ranked.
    cols : list of str
        Names of the columns to rank.
    rank_group : str, optional
        Column to group by before ranking. If None, every remaining row is
        ranked together.
    rank_filter : str, optional
        Column used as a row mask before ranking; only rows where it is True
        are kept. If None, no rows are removed.

    Returns
    -------
    pd.DataFrame
        The ranked ``cols`` for the rows that survived the filter.
    """
    selected = df if rank_filter is None else df.loc[df[rank_filter]]
    if rank_group is None:
        ranked = tie_kept_rank(selected[cols])
    else:
        ranked = selected.groupby(rank_group, group_keys=False).apply(
            lambda group: tie_kept_rank(group[cols])
        )
    return ranked[cols]
79
+
80
+
81
def quantile_bin(
    data: Union[pd.Series, pd.DataFrame],
    bins: tuple[float, ...] = DEFAULT_BINS,
    quantiles: tuple[float, ...] = DEFAULT_QUANTILES,
) -> pd.DataFrame:
    """
    Bin a Series or DataFrame into discrete quantile-based bins.
    Handles identical-value columns by assigning all values to the lowest bin.

    Parameters
    ----------
    data : pd.Series or pd.DataFrame
        Data to bin. A Series is converted to a one-column frame named "value".
    bins : tuple of float
        Values to assign to each bin, lowest bin first.
    quantiles : tuple of float
        Quantile thresholds to use for binning (len = number of bins - 1)

    Returns
    -------
    pd.DataFrame
        Binned values, same shape as input. Values at or below the first
        threshold get ``bins[0]``, values in ``(threshold[i-1], threshold[i]]``
        get ``bins[i]`` and values at or above the last threshold get
        ``bins[-1]``. NaNs in non-constant columns stay NaN.

    Raises
    ------
    AssertionError
        If ``bins`` or ``quantiles`` is empty, or their lengths are inconsistent.
    """
    assert len(bins), "Invalid bins! Must not be empty."
    assert len(quantiles), "Invalid quantiles! Must not be empty."
    assert len(quantiles) == (
        len(bins) - 1
    ), "Invalid quantiles! Length must be 1 less than bins."

    if isinstance(data, pd.Series):
        data = data.to_frame(name="value")

    binned = data.copy()
    for col in binned.columns:
        values = binned[col].astype(float)

        # handle all-identical values: everything goes to the lowest bin
        if values.nunique() <= 1:
            binned[col] = float(bins[0])
            continue

        # thresholds are read positionally so repeated quantile levels
        # cannot produce ambiguous label lookups
        thresholds = values.quantile(quantiles).to_numpy()

        # Every mask is computed from the untouched `values`; the results are
        # written to a separate series so an already assigned bin value can
        # never be compared against a threshold and re-binned.
        result = values.copy()
        result.loc[values <= thresholds[0]] = bins[0]
        for i in range(1, len(bins) - 1):
            in_bin = (values > thresholds[i - 1]) & (values <= thresholds[i])
            result.loc[in_bin] = bins[i]
        result.loc[values >= thresholds[-1]] = bins[-1]

        binned[col] = result

    return binned
@@ -0,0 +1,106 @@
1
+ from typing import List, Tuple, cast, Any
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
+ # leaving this here for backwards compatibility
7
+ from numerai_tools.typing import S1, S2
8
+
9
+
10
+ # sometimes when we match up the target/prediction indices,
11
+ # changes in stock universe causes some stocks to enter / leave,
12
+ # this ensures we don't filter too much
13
+ DEFAULT_MAX_FILTERED_INDEX_RATIO = 0.2
14
+
15
+
16
def filter_sort_index(
    s1: S1, s2: S2, max_filtered_ratio: float = DEFAULT_MAX_FILTERED_INDEX_RATIO
) -> Tuple[S1, S2]:
    """Restrict two datasets to the ids they share (after dropping NaNs),
    verify that neither lost too many rows, and return both sorted by index.

    Arguments:
        s1: Union[pd.DataFrame, pd.Series] - the first dataset to filter and sort
        s2: Union[pd.DataFrame, pd.Series] - the second dataset to filter and sort
        max_filtered_ratio: float - largest fraction of either dataset's rows
                                    that may be dropped by the filtering

    Returns:
        Tuple[
            Union[pd.DataFrame, pd.Series],
            Union[pd.DataFrame, pd.Series],
        ] - the filtered and sorted datasets
    """
    shared_ids = s1.dropna().index.intersection(s2.dropna().index)
    min_overlap = 1 - max_filtered_ratio
    # ensure we didn't filter too many ids
    assert len(shared_ids) / len(s1) >= min_overlap, (
        "s1 does not have enough overlapping ids with s2,"
        f" must have >= {round(1-max_filtered_ratio,2)*100}% overlapping ids"
    )
    assert len(shared_ids) / len(s2) >= min_overlap, (
        "s2 does not have enough overlapping ids with s1,"
        f" must have >= {round(1-max_filtered_ratio,2)*100}% overlapping ids"
    )
    first = s1.loc[shared_ids].sort_index()
    second = s2.loc[shared_ids].sort_index()
    return cast(S1, first), cast(S2, second)
44
+
45
+
46
def filter_sort_index_many(
    inputs: List[Any],
    max_filtered_ratio: float = DEFAULT_MAX_FILTERED_INDEX_RATIO,
) -> List[Any]:
    """Restrict every dataset in the list to the ids shared by all of them
    (after dropping NaNs), sort each by index, and verify that no dataset
    lost too many rows.

    Arguments:
        inputs: List[Union[pd.DataFrame, pd.Series]] - the list of datasets to filter and sort
        max_filtered_ratio: float - largest fraction of any dataset's rows
                                    that may be dropped by the filtering

    Returns:
        List[Union[pd.DataFrame, pd.Series]] - the filtered and sorted datasets
    """
    assert len(inputs) > 0, "List must contain at least one element"
    head, *tail = inputs
    shared_ids = head.dropna().index
    for other in tail:
        shared_ids = shared_ids.intersection(other.dropna().index)
    result = [item.loc[shared_ids].sort_index() for item in inputs]
    # ensure we didn't filter too many ids
    for position, (original, filtered) in enumerate(zip(inputs, result)):
        assert len(filtered) / len(original) >= (1 - max_filtered_ratio), (
            f"inputs[{position}] does not have enough overlapping ids with the others,"
            f" must have >= {round(1-max_filtered_ratio,2)*100}% overlapping ids"
        )
    return result
72
+
73
+
74
def filter_sort_top_bottom(
    s: pd.Series, top_bottom: int
) -> Tuple[pd.Series, pd.Series]:
    """Filters the series according to the top n and bottom n values
    then sorts the index and returns two filtered and sorted series
    for the top and bottom values respectively.

    Ordering is computed on the raw values with a stable numpy argsort, so
    ties keep their original relative order and NaNs are ordered last.

    Arguments:
        s: pd.Series - the data to filter and sort
        top_bottom: int - the number of top n and bottom n values to keep;
                          0 yields two empty series

    Returns:
        Tuple[pd.Series, pd.Series] - the filtered and sorted top and bottom series respectively
    """
    # Sort the underlying array rather than the Series: Series.argsort marks
    # missing values with -1, which are not valid positions for .iloc.
    order = np.argsort(s.to_numpy(), kind="stable")
    bottom_positions = order[:top_bottom]
    # order[-0:] would select everything, so handle n == 0 explicitly
    top_positions = order[-top_bottom:] if top_bottom else order[:0]
    bot = s.iloc[bottom_positions]
    top = s.iloc[top_positions]
    return top.sort_index(), bot.sort_index()
92
+
93
+
94
def filter_sort_top_bottom_concat(s: pd.Series, top_bottom: int) -> pd.Series:
    """Like filter_sort_top_bottom, but returns the top and bottom values
    joined into a single index-sorted series.

    Arguments:
        s: pd.Series - the data to filter and sort
        top_bottom: int - the number of top n and bottom n values to keep

    Returns:
        pd.Series - the concatenated and sorted series of top and bottom values
    """
    # filter_sort_top_bottom returns (top, bottom); concatenate in that order
    pieces = filter_sort_top_bottom(s, top_bottom)
    combined = pd.concat(list(pieces))
    return combined.sort_index()
@@ -0,0 +1,263 @@
1
+ from typing import Optional, cast, Literal
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from scipy import stats
6
+
7
+ # leaving this here for backwards compatibility
8
+ from numerai_tools.typing import S1
9
+
10
+ from numerai_tools.indexing import (
11
+ filter_sort_index,
12
+ filter_sort_top_bottom_concat,
13
+ )
14
+
15
+
16
+ RANK_METHOD_TYPE = Literal["average", "min", "max", "first", "dense"]
17
+
18
+
19
def rank_series(s: pd.Series, method: RANK_METHOD_TYPE = "average") -> pd.Series:
    """Convert a Series to percentile ranks: (rank - 0.5) / count of non-null
    values.

    Arguments:
        s: pd.Series - the data to rank; its index must already be sorted
        method: str - the pandas ranking method to use, options:
                      'average' (default) - keeps ties
                      'first' - breaks ties by index

    Returns:
        pd.Series - the ranked Series
    """
    index_is_sorted = np.array_equal(s.index.sort_values(), s.index)
    assert index_is_sorted, "unsorted index found"
    # Never divide by zero: an empty / all-null series uses a divisor of 1
    non_null = int(s.count())
    divisor = non_null if non_null >= 1 else 1
    raw_ranks = s.rank(method=method)
    return (raw_ranks - 0.5) / divisor
35
+
36
+
37
def rank(s: S1, method: RANK_METHOD_TYPE = "average") -> S1:
    """Percentile rank a Series, or every column of a DataFrame, via
    rank_series.

    Arguments:
        s: pd.DataFrame | pd.Series - the data to rank
        method: str - the pandas ranking method to use, options:
                      'average' (default) - keeps ties
                      'first' - breaks ties by index

    Returns:
        pd.DataFrame | pd.Series - the ranked input data
    """
    if not isinstance(s, pd.Series):
        # DataFrame: rank column by column
        return s.apply(lambda column: rank(column, method=method))
    return cast(S1, rank_series(s, method))
53
+
54
+
55
def tie_broken_rank(df: pd.DataFrame) -> pd.DataFrame:
    """Percentile rank every column using the "first" tie-breaking method."""
    return rank(df, method="first")
58
+
59
+
60
def tie_kept_rank(s: S1) -> S1:
    """Percentile rank the input using the "average" method, so ties share a rank."""
    ranked = rank(s, method="average")
    return cast(S1, ranked)
63
+
64
+
65
def min_max_normalize(s: pd.Series) -> pd.Series:
    """Linearly rescale a series so its minimum maps to 0 and its maximum to 1."""
    lowest = s.min()
    value_range = s.max() - lowest
    return (s - lowest) / value_range
68
+
69
+
70
def variance_normalize(df: pd.DataFrame) -> pd.DataFrame:
    """Divide every column of a df by its standard deviation (as computed by np.std)."""
    column_std = np.std(df, axis=0)
    return df / column_std
73
+
74
+
75
def weight_normalize(s: S1) -> S1:
    """Divide the input by the sum of its absolute values (per column for a df)."""
    total_abs_weight = s.abs().sum(axis=0)
    return cast("S1", s / total_abs_weight)
78
+
79
+
80
def center(s: S1) -> S1:
    """Subtract the mean from the input (column-wise mean for a df)."""
    mean_value = s.mean()
    return cast("S1", s - mean_value)
83
+
84
+
85
def standardize(df: pd.DataFrame) -> pd.DataFrame:
    """Center every column of a df, then variance-normalize the result."""
    centered = center(df)
    return variance_normalize(centered)
88
+
89
+
90
def validate_indices(live_targets: pd.Series, predictions: pd.Series) -> None:
    """Assert that targets and predictions share one sorted index and hold no NaNs.

    Arguments:
        live_targets: pd.Series - the target values
        predictions: pd.Series - the predicted values

    Raises an AssertionError when any of the checks fails.
    """
    target_ids = live_targets.index
    prediction_ids = predictions.index
    sorted_target_ids = target_ids.sort_values()
    # ensure the ids are equivalent and sorted
    assert np.array_equal(prediction_ids, sorted_target_ids)
    assert np.array_equal(target_ids, sorted_target_ids)
    assert np.array_equal(prediction_ids, prediction_ids.sort_values())
    # ensure no nans
    assert not predictions.isna().any()
    assert not live_targets.isna().any()
98
+
99
+
100
def correlation(live_targets: pd.Series, predictions: pd.Series) -> float:
    """Return the np.corrcoef correlation of targets and predictions after
    validating that their indices match and neither contains NaNs."""
    validate_indices(live_targets, predictions)
    # corrcoef yields a 2x2 matrix; the off-diagonal entry is the coefficient
    coefficient_matrix = np.corrcoef(live_targets, predictions)
    return coefficient_matrix[0, 1]
104
+
105
+
106
def tie_broken_rank_correlation(target: pd.Series, predictions: pd.Series) -> float:
    """Correlate the target with the tie-broken percentile rank of the predictions.

    Arguments:
        target: pd.Series - the target values
        predictions: pd.Series - the predictions to rank (may be unnamed)

    Returns:
        float - correlation between the target and the ranked predictions
    """
    ranked_frame = tie_broken_rank(predictions.to_frame())
    # Select the single column by position: to_frame() labels an unnamed
    # Series 0, so looking it up via predictions.name (None) would fail.
    ranked_predictions = ranked_frame.iloc[:, 0]
    return correlation(target, ranked_predictions)
110
+
111
+
112
def spearman_correlation(target: pd.Series, predictions: pd.Series) -> float:
    """Return the Spearman correlation of target and predictions after
    validating that their indices match and neither contains NaNs."""
    validate_indices(target, predictions)
    coefficient = target.corr(predictions, method="spearman")
    return coefficient
115
+
116
+
117
def pearson_correlation(
    target: pd.Series, predictions: pd.Series, top_bottom: Optional[int] = None
) -> float:
    """Return the Pearson correlation of target and predictions.

    Arguments:
        target: pd.Series - the target values
        predictions: pd.Series - the predicted values
        top_bottom: Optional[int] - when a positive number is given, only the
                    top n and bottom n predictions are kept and the target is
                    filtered to the matching ids before correlating

    Returns:
        float - the Pearson correlation coefficient
    """
    use_top_bottom = top_bottom is not None and top_bottom > 0
    if use_top_bottom:
        extremes = filter_sort_top_bottom_concat(predictions, top_bottom)
        # most of the target is expected to be filtered away here, so the
        # allowed filtered ratio is widened accordingly
        allowed_filtered_ratio = 1 - top_bottom / len(target)
        target, predictions = filter_sort_index(
            target, extremes, allowed_filtered_ratio
        )
    validate_indices(target, predictions)
    return target.corr(predictions, method="pearson")
127
+
128
+
129
def sharpe_ratio(s: pd.Series) -> float:
    """Return the sharpe ratio of a series: its mean divided by its standard
    deviation (both computed with numpy)."""
    average = np.mean(s)
    volatility = np.std(s)
    return average / volatility
132
+
133
+
134
def gaussian(df: pd.DataFrame) -> pd.DataFrame:
    """Gaussianize each column of a pandas DataFrame by passing its values
    through the standard normal percent point function (scipy's norm.ppf).

    Arguments:
        df: pd.DataFrame - the data to gaussianize; its index must be sorted

    Returns:
        pd.DataFrame - the gaussianized data
    """
    assert np.array_equal(df.index.sort_values(), df.index)

    def _to_normal_scores(column: pd.Series) -> np.ndarray:
        # inverse CDF of the standard normal distribution
        return cast(np.ndarray, stats.norm.ppf(column))

    return df.apply(_to_normal_scores)
146
+
147
+
148
def power(df: pd.DataFrame, p: float) -> pd.DataFrame:
    """Raise the magnitude of every value to the given power, keeping its sign.

    Arguments:
        df: pd.DataFrame - the data to raise to the given power; must contain
                           no NaNs and have a sorted index
        p: float - the power to which we exponentiate the data

    Returns:
        pd.DataFrame - the sign-preserving powered data; every column is
                       asserted to either be constant or be at least 90%
                       correlated with the original column
    """
    has_nans = df.isna().any().any()
    assert not has_nans, "Data contains NaNs"
    assert np.array_equal(df.index.sort_values(), df.index), "Index is not sorted"
    magnitudes = np.abs(df) ** p
    result = cast(pd.DataFrame, np.sign(df) * magnitudes)
    is_constant = result.std() == 0
    is_correlated = result.corrwith(df) >= 0.9
    assert (is_constant | is_correlated).all()
    return result
164
+
165
+
166
def tie_kept_rank__gaussianize__pow_1_5(df: pd.DataFrame) -> pd.DataFrame:
    """Apply three transforms in sequence to the given pandas DataFrame:
    tie-kept rank, then gaussianize, then raise to the 1.5 power.

    Arguments:
        df: pd.DataFrame - the data to transform

    Returns:
        pd.DataFrame - the resulting data after applying the 3 functions
    """
    ranked = tie_kept_rank(df)
    gaussianized = gaussian(ranked)
    return power(gaussianized, 1.5)
177
+
178
+
179
def tie_kept_rank__gaussianize__neutralize__variance_normalize(
    df: pd.DataFrame, neutralizers: pd.DataFrame
) -> pd.DataFrame:
    """Apply four transforms in sequence to the given pandas DataFrame.
    1. tie-kept rank each column
    2. gaussianize each column
    3. neutralize each column to the neutralizers
    4. variance normalize each column

    Arguments:
        df: pd.DataFrame - the data to transform
        neutralizers: pd.DataFrame - the data to neutralize each column against

    Returns:
        pd.DataFrame - the resulting data after applying the 4 functions
    """
    ranked = tie_kept_rank(df)
    gaussianized = gaussian(ranked)
    neutralized = neutralize(gaussianized, neutralizers)
    return variance_normalize(neutralized)
195
+
196
+
197
def orthogonalize(v: np.ndarray, u: np.ndarray) -> np.ndarray:
    """Orthogonalizes v with respect to u: the projection of v onto u is
    computed and then subtracted from v.

    This will reach the same result as the neutralize
    function when v and u are centered single column vectors,
    but this is much faster.

    Arguments:
        v: np.ndarray - the vector to orthogonalize
        u: np.ndarray - the vector to orthogonalize v against

    Returns:
        np.ndarray - the orthogonalized vector v
    """
    # scalar projection coefficient(s) of v along u
    coefficients = (v.T @ u) / (u.T @ u)
    projection = np.outer(u, coefficients)
    return v - projection
213
+
214
+
215
def stake_weight(
    predictions: pd.DataFrame,
    stakes: pd.Series,
) -> pd.Series:
    """Build a stake-weighted meta model: the per-row weighted average of the
    prediction columns named in the stakes index, weighted by stake.

    Arguments:
        predictions: pd.DataFrame - the predictions to weight
        stakes: pd.Series - the stakes to use as weights, indexed by the
                            names of the prediction columns

    Returns:
        pd.Series - the stake-weighted meta model
    """
    staked_columns = predictions[stakes.index]
    weighted = staked_columns * stakes
    return weighted.sum(axis=1) / stakes.sum()
229
+
230
+
231
def neutralize(
    df: pd.DataFrame,
    neutralizers: pd.DataFrame,
    proportion: float = 1.0,
) -> pd.DataFrame:
    """Neutralize each column of a given DataFrame by each feature in a given
    neutralizers DataFrame. Neutralization uses least-squares regression to
    find the orthogonal projection of each column onto the neutralizers, then
    subtracts the result from the original predictions.

    Columns of df with zero standard deviation are set to NaN before the
    regression. The input DataFrame itself is left untouched.

    Arguments:
        df: pd.DataFrame - the data with columns to neutralize
        neutralizers: pd.DataFrame - the neutralizer data with features as columns
        proportion: float - the degree to which neutralization occurs

    Returns:
        pd.DataFrame - the neutralized data

    Raises:
        AssertionError - if either input contains NaNs or the indices differ
    """
    assert not df.isna().any().any(), "Data contains NaNs"
    assert not neutralizers.isna().any().any(), "Neutralizers contain NaNs"
    assert len(df.index) == len(neutralizers.index), "Indices don't match"
    assert (df.index == neutralizers.index).all(), "Indices don't match"
    # Work on a copy so blanking the constant columns below does not
    # overwrite data in the caller's DataFrame.
    df = df.copy()
    df[df.columns[df.std() == 0]] = np.nan
    df_arr = df.values
    # add a column of 1s to the neutralizer array in case it is a single column
    intercept = np.ones((len(neutralizers), 1), dtype=int)
    neutralizer_arr = np.hstack((neutralizers.values, intercept))
    least_squares = np.linalg.lstsq(neutralizer_arr, df_arr, rcond=1e-6)[0]
    adjustments = proportion * neutralizer_arr.dot(least_squares)
    neutral = df_arr - adjustments
    return pd.DataFrame(neutral, index=df.index, columns=df.columns)