numerai-tools 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,228 @@
1
+ from typing import List, Tuple, Union
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from scipy import stats
6
+ from sklearn.preprocessing import OneHotEncoder
7
+
8
+
9
+ # this is primarily used b/c round 326 had too many stocks,
10
+ # so we need to filter out the unnecessary ids here just in case
11
+ # it's also just a convenient way to ensure everything is sorted/matching
12
def filter_sort_index(
    s1: Union[pd.DataFrame, pd.Series],
    s2: Union[pd.DataFrame, pd.Series],
    max_filtered_ratio: float = 0.2,
) -> Tuple[Union[pd.DataFrame, pd.Series], Union[pd.DataFrame, pd.Series]]:
    """Restrict two pandas objects to their shared, NaN-free ids and sort them.

    Rows containing NaNs are dropped from each input, the remaining index
    labels are intersected, and both inputs are returned restricted to that
    intersection and sorted by index.

    Arguments:
        s1: pd.DataFrame or pd.Series - the first object to align
        s2: pd.DataFrame or pd.Series - the second object to align
        max_filtered_ratio: float - the largest fraction of rows that may be
            filtered out of either input before the call is rejected

    Returns:
        Tuple - (s1, s2) restricted to the shared ids and sorted by index

    Raises:
        AssertionError - if either input is empty or if more than
            max_filtered_ratio of either input would be filtered out
    """
    ids = s1.dropna().index.intersection(s2.dropna().index)
    # ensure we didn't filter too many ids; raise explicitly so the check
    # still runs when Python is started with -O (which strips assert)
    for label, data in (("s1", s1), ("s2", s2)):
        if len(data) == 0:
            raise AssertionError(f"{label} is empty, there is nothing to align")
        kept_ratio = len(ids) / len(data)
        if not kept_ratio >= (1 - max_filtered_ratio):
            raise AssertionError(
                f"only {len(ids)} of {len(data)} ids in {label} remain after "
                f"filtering, max_filtered_ratio is {max_filtered_ratio}"
            )
    return s1.loc[ids].sort_index(), s2.loc[ids].sort_index()
22
+
23
+
24
def rank(df: pd.DataFrame, method: str = "average") -> pd.DataFrame:
    """Percentile rank each column of a pandas DataFrame, centering values around 0.5

    Each column is ranked with pandas, shifted down by 0.5 and divided by the
    number of non-NaN values in that column.

    Arguments:
        df: pd.DataFrame - the data to rank
        method: str - the pandas ranking method to use, options:
            'average' (default) - keeps ties
            'first' - breaks ties by index

    Returns:
        pd.DataFrame - the ranked DataFrame

    Raises:
        AssertionError - if the index of df is not sorted
    """
    # explicit raise instead of assert so the check survives `python -O`
    if not np.array_equal(df.index.sort_values(), df.index):
        raise AssertionError("unsorted index found")
    return df.apply(
        lambda series: (series.rank(method=method).values - 0.5) / series.count()
    )
40
+
41
+
42
def tie_broken_rank(df: pd.DataFrame) -> pd.DataFrame:
    """Percentile rank each column of df, breaking ties by index order.

    Arguments:
        df: pd.DataFrame - the data to rank

    Returns:
        pd.DataFrame - the result of rank() with the 'first' method
    """
    ranked = rank(df, method="first")
    return ranked
45
+
46
+
47
def tie_kept_rank(df: pd.DataFrame) -> pd.DataFrame:
    """Percentile rank each column of df, keeping ties.

    Arguments:
        df: pd.DataFrame - the data to rank

    Returns:
        pd.DataFrame - the result of rank() with the 'average' method
    """
    ranked = rank(df, method="average")
    return ranked
50
+
51
+
52
def min_max_normalize(s: pd.Series) -> pd.Series:
    """Linearly rescale a series so its minimum maps to 0 and its maximum to 1.

    Arguments:
        s: pd.Series - the data to rescale

    Returns:
        pd.Series - (s - min) / (max - min)
    """
    lowest = s.min()
    highest = s.max()
    shifted = s - lowest
    return shifted / (highest - lowest)
55
+
56
+
57
def validate_indices(live_targets: pd.Series, predictions: pd.Series) -> None:
    """Check that targets and predictions share one sorted index and have no NaNs.

    Arguments:
        live_targets: pd.Series - the targets to validate
        predictions: pd.Series - the predictions to validate

    Raises:
        AssertionError - if the indices differ, if either index is unsorted,
            or if either series contains NaNs
    """
    # explicit raises (rather than assert) keep these checks active under
    # `python -O` while preserving the exception type callers may rely on
    sorted_target_index = live_targets.index.sort_values()
    # ensure the ids are equivalent and sorted
    if not np.array_equal(predictions.index, sorted_target_index):
        raise AssertionError(
            "predictions index does not match the sorted live_targets index"
        )
    if not np.array_equal(live_targets.index, sorted_target_index):
        raise AssertionError("live_targets index is not sorted")
    if not np.array_equal(predictions.index, predictions.index.sort_values()):
        raise AssertionError("predictions index is not sorted")
    # ensure no nans
    if predictions.isna().any():
        raise AssertionError("predictions contain NaNs")
    if live_targets.isna().any():
        raise AssertionError("live_targets contain NaNs")
65
+
66
+
67
def correlation(live_targets: pd.Series, predictions: pd.Series) -> float:
    """Compute the correlation coefficient between targets and predictions.

    The inputs are first checked with validate_indices.

    Arguments:
        live_targets: pd.Series - the targets to compare against
        predictions: pd.Series - the predictions to evaluate

    Returns:
        float - the off-diagonal entry of np.corrcoef for the two series
    """
    validate_indices(live_targets, predictions)
    # np.corrcoef returns a 2x2 matrix, the cross term sits at [0, 1]
    corr_matrix = np.corrcoef(live_targets, predictions)
    return corr_matrix[0, 1]
71
+
72
+
73
def tie_broken_rank_correlation(
    live_targets: pd.Series, predictions: pd.Series
) -> float:
    """Correlate live_targets with the tie-broken percentile rank of predictions.

    Arguments:
        live_targets: pd.Series - the targets to compare against
        predictions: pd.Series - the predictions to rank and evaluate

    Returns:
        float - the correlation between live_targets and the ranked predictions
    """
    # percentile rank the predictions and get the correlation with live_targets;
    # select the single column by position because an unnamed Series becomes
    # column 0 in to_frame(), so looking it up by predictions.name (None) fails
    ranked_predictions = tie_broken_rank(predictions.to_frame()).iloc[:, 0]
    return correlation(live_targets, ranked_predictions)
79
+
80
+
81
def spearman_correlation(live_targets: pd.Series, predictions: pd.Series) -> float:
    """Compute the Spearman correlation between targets and predictions.

    The inputs are first checked with validate_indices.

    Arguments:
        live_targets: pd.Series - the targets to compare against
        predictions: pd.Series - the predictions to evaluate

    Returns:
        float - the result of pandas Series.corr with method "spearman"
    """
    validate_indices(live_targets, predictions)
    score = live_targets.corr(other=predictions, method="spearman")
    return score
85
+
86
+
87
def pearson_correlation(live_targets: pd.Series, predictions: pd.Series) -> float:
    """Compute the Pearson correlation between targets and predictions.

    The inputs are first checked with validate_indices.

    Arguments:
        live_targets: pd.Series - the targets to compare against
        predictions: pd.Series - the predictions to evaluate

    Returns:
        float - the result of pandas Series.corr with method "pearson"
    """
    validate_indices(live_targets, predictions)
    score = live_targets.corr(other=predictions, method="pearson")
    return score
91
+
92
+
93
def power(df: pd.DataFrame, p: float) -> pd.DataFrame:
    """Raise given predictions series to the given power.

    The sign of every value is preserved: the result is
    sign(df) * abs(df) ** p.

    Arguments:
        df: pd.DataFrame - the data to raise to the given power
        p: float - the power to which we exponentiate the data

    Returns:
        pd.DataFrame - the predictions raised to the given power,
            each column should be at least 90% correlated with the original data

    Raises:
        AssertionError - if df contains NaNs, if its index is not sorted, or
            if a non-constant result column is less than 90% correlated with
            the original column
    """
    # explicit raises (rather than assert) so validation survives `python -O`
    if df.isna().any().any():
        raise AssertionError("Data contains NaNs")
    if not np.array_equal(df.index.sort_values(), df.index):
        raise AssertionError("Index is not sorted")
    result = np.sign(df) * np.abs(df) ** p
    # constant columns have no defined correlation, so they are exempt
    acceptable = (result.std() == 0) | (result.corrwith(df) >= 0.9)
    if not acceptable.all():
        raise AssertionError(
            "Result is less than 90% correlated with the original data"
        )
    return result
109
+
110
+
111
def gaussian(df: pd.DataFrame) -> pd.DataFrame:
    """Gaussianize each column of a pandas DataFrame using a normal percent point func

    Arguments:
        df: pd.DataFrame - the data to gaussianize

    Returns:
        pd.DataFrame - the gaussianized data

    Raises:
        AssertionError - if the index of df is not sorted
    """
    # explicit raise instead of assert so the check survives `python -O`
    if not np.array_equal(df.index.sort_values(), df.index):
        raise AssertionError("Index is not sorted")
    return df.apply(lambda series: stats.norm.ppf(series))
122
+
123
+
124
def neutralize(
    df: pd.DataFrame, neutralizers: pd.DataFrame, proportion: float = 1.0
) -> pd.DataFrame:
    """Neutralize each column of a given DataFrame by each feature in a given
    neutralizers DataFrame.

    The projection of every column of df onto the neutralizers is computed
    with a pseudo-inverse, `proportion` times that projection is subtracted,
    and each resulting column is divided by its standard deviation. Columns
    of df with zero standard deviation come back as all-NaN. The caller's df
    is left untouched.

    Arguments:
        df: pd.DataFrame - the data with columns to neutralize
        neutralizers: pd.DataFrame - the neutralizer data with features as columns
        proportion: float - the degree to which neutralization occurs

    Returns:
        pd.DataFrame - the neutralized data

    Raises:
        AssertionError - if neutralizers contain NaNs or if the indices of
            df and neutralizers do not match
    """
    # explicit raises (rather than assert) so validation survives `python -O`
    if neutralizers.isna().any().any():
        raise AssertionError("Neutralizers contain NaNs")
    if len(df.index) != len(neutralizers.index):
        raise AssertionError("Indices don't match")
    if not (df.index == neutralizers.index).all():
        raise AssertionError("Indices don't match")
    # blank out constant columns on a private float copy; assigning into df
    # itself would overwrite the caller's data as a side effect
    df_arr = df.to_numpy(dtype=np.float64, copy=True)
    df_arr[:, (df.std() == 0).to_numpy()] = np.nan
    neutralizer_arr = neutralizers.values
    inverse_neutralizers = np.linalg.pinv(neutralizer_arr, rcond=1e-6)
    adjustments = proportion * neutralizer_arr.dot(inverse_neutralizers.dot(df_arr))
    neutral = df_arr - adjustments
    neutral /= np.std(neutral, axis=0)
    return pd.DataFrame(neutral, index=df.index, columns=df.columns)
150
+
151
+
152
def one_hot_encode(
    df: pd.DataFrame, columns: List[str], dtype: type = np.float64
) -> pd.DataFrame:
    """One-hot encodes specified columns in a pandas dataframe.
    Each column i should have x_i discrete values (eg. categories, bucket values, etc.)
    and will be converted to x_i columns that each have 0s for rows that don't have
    the associated value and 1s for rows that do have that value.

    Arguments:
        df: pd.DataFrame - the data with columns to one-hot encode
        columns: List[str] - list of columns names to replace w/ one-hot encoding
        dtype: type = np.float64 - the target datatype for the resulting columns

    Returns:
        pd.DataFrame - original data, but specified cols replaced w/ one-hot encoding

    """
    for col in columns:
        encoder = OneHotEncoder(dtype=dtype)
        encoded = encoder.fit_transform(df[[col]])
        # get_feature_names was deprecated in scikit-learn 1.0 and removed in
        # 1.2; prefer its replacement and fall back for older releases.
        # NOTE(review): the two methods may label the generated columns
        # differently - confirm against the installed scikit-learn version
        name_getter = getattr(encoder, "get_feature_names_out", None)
        if name_getter is None:
            name_getter = encoder.get_feature_names
        one_hot = pd.DataFrame(
            encoded.toarray(),
            columns=name_getter(),
            index=df.index,
        )
        df = df.join(one_hot).drop(columns=col)
    return df
179
+
180
+
181
def tie_kept_rank__gaussianize__pow_1_5(df: pd.DataFrame) -> pd.DataFrame:
    """Apply tie_kept_rank, then gaussian, then power(..., 1.5) to the data.

    Arguments:
        df: pd.DataFrame - the data to transform

    Returns:
        pd.DataFrame - the data after the three transforms, applied in order
    """
    ranked = tie_kept_rank(df)
    gaussianized = gaussian(ranked)
    return power(gaussianized, 1.5)
193
+
194
+
195
def tie_kept_rank__gaussianize__neutralize(
    df: pd.DataFrame, neutralizers: pd.DataFrame
) -> pd.DataFrame:
    """Apply tie_kept_rank, then gaussian, then neutralize to the data.

    Arguments:
        df: pd.DataFrame - the data to transform
        neutralizers: pd.DataFrame - the data passed to neutralize() as the
            neutralizers for the final step

    Returns:
        pd.DataFrame - the data after the three transforms, applied in order
    """
    ranked = tie_kept_rank(df)
    gaussianized = gaussian(ranked)
    return neutralize(gaussianized, neutralizers)
208
+
209
+
210
def numerai_corr(predictions: pd.DataFrame, targets: pd.Series) -> pd.Series:
    """Recenter the target on 0, filter and sort indices, apply tie_kept_rank__gaussianize__pow_1_5
    to the predictions, raise the targets to the 1.5 power, then calculate the
    pearson correlation between the predictions and targets.

    The caller's targets and predictions are not modified.

    Arguments:
        predictions: pd.DataFrame - the predictions to evaluate
        targets: pd.Series - the live targets to evaluate against

    Returns:
        pd.Series - the resulting correlation scores for each column in predictions

    """
    # build a new centered Series; an in-place `-=` would overwrite the
    # Series object the caller passed in
    targets = targets - targets.mean()
    targets, predictions = filter_sort_index(targets, predictions)
    predictions = tie_kept_rank__gaussianize__pow_1_5(predictions)
    # take the single column by position: an unnamed Series becomes column 0
    # in to_frame(), so a lookup by targets.name (None) would fail
    targets = power(targets.to_frame(), 1.5).iloc[:, 0]
    scores = predictions.apply(lambda sub: pearson_correlation(targets, sub))
    return scores
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Numerai
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,27 @@
1
+ Metadata-Version: 2.1
2
+ Name: numerai-tools
3
+ Version: 0.0.1
4
+ Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
5
+ Home-page: https://github.com/numerai/numerai-tools
6
+ Maintainer: Numerai
7
+ Maintainer-email: support@numer.ai
8
+ License: MIT License
9
+ Platform: OS Independent
10
+ Classifier: Development Status :: 5 - Production/Stable
11
+ Classifier: Environment :: Console
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Topic :: Scientific/Engineering
18
+ Description-Content-Type: text/markdown
19
+ Requires-Dist: pandas (==1.2.4)
20
+ Requires-Dist: numpy (==1.20.3)
21
+ Requires-Dist: scipy (==1.2.1)
22
+ Requires-Dist: sklearn (==0.0)
23
+
24
+ # numerai-tools
25
+ A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
26
+
27
+
@@ -0,0 +1,7 @@
1
+ numerai_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ numerai_tools/scoring.py,sha256=8JtVW7PmdufqBdb7jQuqoQkZLjf-iqwwb8EzCF1MA4g,8421
3
+ numerai_tools-0.0.1.dist-info/LICENSE,sha256=5GaPaG8D6JPMIFh32ux8ZcURhf9mUgjcMxriwBgPdZY,1064
4
+ numerai_tools-0.0.1.dist-info/METADATA,sha256=Gzm2PPrcd9DJ_vfpKWg6TP3bd2EmexoKvLyA4oqrXcM,984
5
+ numerai_tools-0.0.1.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
6
+ numerai_tools-0.0.1.dist-info/top_level.txt,sha256=QKdtgNVARMZdDuypEmRzAhai8XPzCHfK_xH4nQZ46gU,14
7
+ numerai_tools-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: bdist_wheel (0.40.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ numerai_tools