numerai-tools 0.5.0.dev1__tar.gz → 0.5.0.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,11 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.3
2
2
  Name: numerai-tools
3
- Version: 0.5.0.dev1
3
+ Version: 0.5.0.dev2
4
4
  Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
5
- Home-page: https://github.com/numerai/numerai-tools
6
- Maintainer: Numerai
7
- Maintainer-email: support@numer.ai
8
- License: MIT License
9
- Platform: OS Independent
5
+ License: MIT
6
+ Author: Numerai Engineering
7
+ Author-email: engineering@numer.ai
8
+ Requires-Python: >=3.11
10
9
  Classifier: Development Status :: 5 - Production/Stable
11
10
  Classifier: Environment :: Console
12
11
  Classifier: Intended Audience :: Science/Research
@@ -15,8 +14,15 @@ Classifier: Operating System :: OS Independent
15
14
  Classifier: Programming Language :: Python
16
15
  Classifier: Programming Language :: Python :: 3
17
16
  Classifier: Topic :: Scientific/Engineering
17
+ Requires-Dist: numpy (>=2.0.0,<3.0.0)
18
+ Requires-Dist: pandas (>=2.2.2,<3.0.0)
19
+ Requires-Dist: scikit-learn (>=1.5.0,<2.0.0)
20
+ Requires-Dist: scipy (>=1.13.0,<2.0.0)
21
+ Project-URL: Documentation, https://docs.numer.ai/
22
+ Project-URL: Homepage, https://numer.ai
23
+ Project-URL: Repository, https://github.com/numerai/numerai-tools
18
24
  Description-Content-Type: text/markdown
19
- License-File: LICENSE
20
25
 
21
26
  # numerai-tools
22
27
  A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
28
+
@@ -0,0 +1,45 @@
1
+ [project]
2
+ name = "numerai-tools"
3
+ version = "0.5.0.dev2"
4
+ description = "A collection of open-source tools to help interact with Numerai, model data, and automate submissions."
5
+ authors = [
6
+ {name = "Numerai Engineering",email = "engineering@numer.ai"}
7
+ ]
8
+ license = {text = "MIT"}
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ classifiers = [
12
+ "Development Status :: 5 - Production/Stable",
13
+ "Environment :: Console",
14
+ "Intended Audience :: Science/Research",
15
+ "License :: OSI Approved :: MIT License",
16
+ "Operating System :: OS Independent",
17
+ "Programming Language :: Python",
18
+ "Programming Language :: Python :: 3",
19
+ "Topic :: Scientific/Engineering",
20
+ ]
21
+
22
+ [project.urls]
23
+ homepage = "https://numer.ai"
24
+ repository = "https://github.com/numerai/numerai-tools"
25
+ documentation = "https://docs.numer.ai/"
26
+
27
+ [tool.poetry]
28
+ packages = [
29
+ {include = "numerai_tools", from = "."},
30
+ ]
31
+
32
+ [tool.poetry.dependencies]
33
+ pandas = "^2.2.2"
34
+ numpy = "^2.0.0"
35
+ scipy = "^1.13.0"
36
+ scikit-learn = "^1.5.0"
37
+
38
+ [tool.poetry.group.dev.dependencies]
39
+ pytest = "^8.3.4"
40
+ mypy = "^1.15.0"
41
+ ruff = "^0.5.4"
42
+
43
+ [build-system]
44
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
45
+ build-backend = "poetry.core.masonry.api"
@@ -1,22 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: numerai_tools
3
- Version: 0.5.0.dev1
4
- Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
5
- Home-page: https://github.com/numerai/numerai-tools
6
- Maintainer: Numerai
7
- Maintainer-email: support@numer.ai
8
- License: MIT License
9
- Platform: OS Independent
10
- Classifier: Development Status :: 5 - Production/Stable
11
- Classifier: Environment :: Console
12
- Classifier: Intended Audience :: Science/Research
13
- Classifier: License :: OSI Approved :: MIT License
14
- Classifier: Operating System :: OS Independent
15
- Classifier: Programming Language :: Python
16
- Classifier: Programming Language :: Python :: 3
17
- Classifier: Topic :: Scientific/Engineering
18
- Description-Content-Type: text/markdown
19
- License-File: LICENSE
20
-
21
- # numerai-tools
22
- A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
@@ -1,16 +0,0 @@
1
- LICENSE
2
- README.md
3
- setup.py
4
- numerai_tools/__init__.py
5
- numerai_tools/py.typed
6
- numerai_tools/scoring.py
7
- numerai_tools/signals.py
8
- numerai_tools/submissions.py
9
- numerai_tools.egg-info/PKG-INFO
10
- numerai_tools.egg-info/SOURCES.txt
11
- numerai_tools.egg-info/dependency_links.txt
12
- numerai_tools.egg-info/requires.txt
13
- numerai_tools.egg-info/top_level.txt
14
- tests/test_scoring.py
15
- tests/test_signals.py
16
- tests/test_submissions.py
@@ -1,4 +0,0 @@
1
- pandas<3.0.0,>=2.2.2
2
- numpy<3.0.0,>=2.0.0
3
- scipy<2.0.0,>=1.13.0
4
- scikit-learn<2.0.0,>=1.5.0
@@ -1 +0,0 @@
1
- numerai_tools
@@ -1,4 +0,0 @@
1
- [egg_info]
2
- tag_build =
3
- tag_date = 0
4
-
@@ -1,47 +0,0 @@
1
- from setuptools import setup
2
- from setuptools import find_packages
3
-
4
- VERSION = "0.5.0.dev1"
5
-
6
-
7
- def load(path):
8
- return open(path, "r").read()
9
-
10
-
11
- classifiers = [
12
- "Development Status :: 5 - Production/Stable",
13
- "Environment :: Console",
14
- "Intended Audience :: Science/Research",
15
- "License :: OSI Approved :: MIT License",
16
- "Operating System :: OS Independent",
17
- "Programming Language :: Python",
18
- "Programming Language :: Python :: 3",
19
- "Topic :: Scientific/Engineering",
20
- ]
21
-
22
-
23
- if __name__ == "__main__":
24
- setup(
25
- name="numerai_tools",
26
- version=VERSION,
27
- maintainer="Numerai",
28
- maintainer_email="support@numer.ai",
29
- description="A collection of open-source tools to help interact with Numerai, model data, and automate submissions.",
30
- long_description=load("README.md"),
31
- long_description_content_type="text/markdown",
32
- url="https://github.com/numerai/numerai-tools",
33
- platforms="OS Independent",
34
- classifiers=classifiers,
35
- license="MIT License",
36
- package_data={
37
- "numerai_tools": ["LICENSE", "README.md", "py.typed"],
38
- },
39
- packages=find_packages(exclude=["tests"]),
40
- install_requires=[
41
- # pandas 2.2.2 was the first version to support numpy 2
42
- "pandas>=2.2.2,<3.0.0",
43
- "numpy>=2.0.0,<3.0.0",
44
- "scipy>=1.13.0,<2.0.0",
45
- "scikit-learn>=1.5.0,<2.0.0",
46
- ],
47
- )
@@ -1,346 +0,0 @@
1
- import unittest
2
-
3
- import numpy as np
4
- import pandas as pd # type: ignore
5
-
6
- from numerai_tools.scoring import (
7
- correlation,
8
- numerai_corr,
9
- tie_broken_rank_correlation,
10
- spearman_correlation,
11
- pearson_correlation,
12
- tie_broken_rank,
13
- tie_kept_rank,
14
- gaussian,
15
- neutralize,
16
- one_hot_encode,
17
- power,
18
- tie_kept_rank__gaussianize__pow_1_5,
19
- variance_normalize,
20
- orthogonalize,
21
- stake_weight,
22
- filter_sort_index,
23
- filter_sort_index_many,
24
- filter_sort_top_bottom,
25
- filter_sort_top_bottom_concat,
26
- alpha,
27
- meta_portfolio_contribution,
28
- )
29
-
30
-
31
- class TestScoring(unittest.TestCase):
32
- def setUp(self):
33
- self.up = pd.Series(list(range(5))).rename("up")
34
- self.down = pd.Series(list(reversed(range(5)))).rename("down")
35
- self.up_down = pd.Series([1, 0, 1, 0, 1]).rename("up_down")
36
- self.down_up = (1 - self.up_down).rename("down_up")
37
- self.up_float = (self.up / self.up.max()).rename("up_float")
38
- self.pos_neg = pd.Series([0, -0, 0.5, -0.5, 1.0, -1.0, 2.0, -2.0]).rename(
39
- "pos_neg"
40
- )
41
-
42
- def test_filter_sort_index(self):
43
- # Test with 2 simple ranges with different indices
44
- s = pd.Series([1, 2, 3, 4, 5], index=[0, 1, 2, 3, 4])
45
- t = pd.Series([1, 2, 3, 4, 5], index=[1, 2, 3, 4, 5])
46
- new_s, new_t = filter_sort_index(s, t)
47
- self.assertEqual(len(new_s), 4)
48
- self.assertEqual(len(new_t), 4)
49
- self.assertTrue(np.array_equal(new_s.index, [1, 2, 3, 4]))
50
- self.assertTrue(np.array_equal(new_t.index, [1, 2, 3, 4]))
51
- self.assertTrue(np.array_equal(new_s.values, [2, 3, 4, 5]))
52
- self.assertTrue(np.array_equal(new_t.values, [1, 2, 3, 4]))
53
-
54
- def test_filter_sort_index_invalid(self):
55
- # Ensure assertion error when max filtered ratio is exceeded
56
- s = pd.Series([1, 2, 3, 4, 5], index=[0, 1, 2, 3, 4])
57
- t = pd.Series([1, 2, 3, 4, 5], index=[1, 2, 3, 4, 5])
58
- with self.assertRaises(AssertionError):
59
- filter_sort_index(s, t, max_filtered_ratio=0.1)
60
-
61
- def test_filter_sort_index_many(self):
62
- # Test with a DataFrame
63
- s = pd.Series([1, 2, 3, 4, 5], index=[0, 1, 2, 3, 4])
64
- t = pd.Series([1, 2, 3, 4, 5], index=[1, 2, 3, 4, 5])
65
- new_s, new_t = filter_sort_index_many([s, t])
66
- self.assertEqual(len(new_s), 4)
67
- self.assertEqual(len(new_t), 4)
68
- self.assertTrue(np.array_equal(new_s.index, [1, 2, 3, 4]))
69
- self.assertTrue(np.array_equal(new_t.index, [1, 2, 3, 4]))
70
- self.assertTrue(np.array_equal(new_s.values, [2, 3, 4, 5]))
71
- self.assertTrue(np.array_equal(new_t.values, [1, 2, 3, 4]))
72
-
73
- def test_filter_sort_index_many_invalid(self):
74
- # Ensure assertion error when max filtered ratio is exceeded
75
- s = pd.Series([1, 2, 3, 4, 5], index=[0, 1, 2, 3, 4])
76
- t = pd.Series([1, 2, 3, 4, 5], index=[1, 2, 3, 4, 5])
77
- with self.assertRaises(AssertionError):
78
- filter_sort_index_many([s, t], max_filtered_ratio=0.1)
79
-
80
- def test_correlation(self):
81
- assert np.isclose(correlation(self.up, self.up), 1)
82
- assert np.isclose(correlation(self.up, self.down), -1)
83
- assert np.isclose(correlation(self.up, self.up_down), 0)
84
- assert np.isclose(correlation(self.up, self.down_up), 0)
85
-
86
- def test_tie_broken_rank_correlation(self):
87
- assert np.isclose(tie_broken_rank_correlation(self.up, self.up), 1)
88
- assert np.isclose(tie_broken_rank_correlation(self.up, self.down), -1)
89
- # tie_broken_rank_correlation ranks the submission not the targets
90
- assert np.isclose(tie_broken_rank_correlation(self.up, self.up_down), 0.5)
91
- assert np.isclose(tie_broken_rank_correlation(self.up, self.down_up), 0.5)
92
- assert np.isclose(tie_broken_rank_correlation(self.up_down, self.up), 0)
93
- assert np.isclose(tie_broken_rank_correlation(self.down_up, self.up), 0)
94
-
95
- def test_spearman_correlation(self):
96
- assert np.isclose(spearman_correlation(self.up, self.up), 1)
97
- assert np.isclose(spearman_correlation(self.up, self.down), -1)
98
- assert np.isclose(spearman_correlation(self.up, self.up_down), 0)
99
- assert np.isclose(spearman_correlation(self.up, self.down_up), 0)
100
- assert np.isclose(spearman_correlation(self.up_down, self.up), 0)
101
- assert np.isclose(spearman_correlation(self.down_up, self.up), 0)
102
-
103
- def test_pearson_correlation(self):
104
- assert np.isclose(pearson_correlation(self.up, self.up), 1)
105
- assert np.isclose(pearson_correlation(self.up, self.down), -1)
106
- assert np.isclose(pearson_correlation(self.up, self.up_down), 0)
107
- assert np.isclose(pearson_correlation(self.up, self.down_up), 0)
108
- assert np.isclose(pearson_correlation(self.up_down, self.up), 0)
109
- assert np.isclose(pearson_correlation(self.down_up, self.up), 0)
110
-
111
- def test_tie_broken_rank(self):
112
- assert np.isclose(
113
- tie_broken_rank(self.up.to_frame()).T, [0.1, 0.3, 0.5, 0.7, 0.9]
114
- ).all()
115
- assert np.isclose(
116
- tie_broken_rank(self.up_down.to_frame()).T, [0.5, 0.1, 0.7, 0.3, 0.9]
117
- ).all()
118
-
119
- def test_tie_kept_rank(self):
120
- assert np.isclose(
121
- tie_kept_rank(self.up.to_frame()).T, [0.1, 0.3, 0.5, 0.7, 0.9]
122
- ).all()
123
- assert np.isclose(
124
- tie_kept_rank(self.up_down.to_frame()).T, [0.7, 0.2, 0.7, 0.2, 0.7]
125
- ).all()
126
-
127
- def test_gaussian(self):
128
- assert np.isclose(
129
- gaussian(self.up_float).values.T,
130
- [-np.inf, -0.6744897501960817, 0, 0.6744897501960817, np.inf],
131
- ).all()
132
-
133
- def test_variance_normalize(self):
134
- assert np.isclose(
135
- variance_normalize(self.up_float).values.T,
136
- [
137
- 0.0,
138
- 0.7071067811865475,
139
- 1.414213562373095,
140
- 2.1213203435596424,
141
- 2.82842712474619,
142
- ],
143
- ).all()
144
-
145
- def test_one_hot_encode(self):
146
- assert np.isclose(
147
- one_hot_encode(self.up.to_frame(), ["up"]).values.T,
148
- [
149
- [1.0, 0.0, 0.0, 0.0, 0.0],
150
- [0.0, 1.0, 0.0, 0.0, 0.0],
151
- [0.0, 0.0, 1.0, 0.0, 0.0],
152
- [0.0, 0.0, 0.0, 1.0, 0.0],
153
- [0.0, 0.0, 0.0, 0.0, 1.0],
154
- ],
155
- ).all()
156
-
157
- def test_power(self):
158
- assert np.isclose(
159
- power(self.pos_neg.to_frame(), 1.5),
160
- [
161
- [0.0],
162
- [0.0],
163
- [0.3535533905932738],
164
- [-0.3535533905932738],
165
- [1.0000000000000000],
166
- [-1.0000000000000000],
167
- [2.8284271247461903],
168
- [-2.8284271247461903],
169
- ],
170
- ).all()
171
-
172
- def test_tie_kept_rank__gaussianize__pow_1_5(self):
173
- assert np.isclose(
174
- tie_kept_rank__gaussianize__pow_1_5(self.up_float.to_frame()),
175
- [
176
- [-1.4507885796854221],
177
- [-0.3797472709071263],
178
- [0.0000000000000000],
179
- [0.3797472709071261],
180
- [1.4507885796854221],
181
- ],
182
- ).all()
183
-
184
- def test_orthoganalize(self):
185
- assert np.isclose(
186
- orthogonalize(self.up.to_frame().values, self.up.to_frame().values),
187
- [0, 0, 0, 0, 0],
188
- ).all()
189
- assert np.isclose(
190
- orthogonalize(self.up.to_frame().values, self.up_down.to_frame().values),
191
- [[-2], [1], [0], [3], [2]],
192
- ).all()
193
- assert np.isclose(
194
- orthogonalize(
195
- self.down_up.to_frame().values, self.up_down.to_frame().values
196
- ),
197
- [[0], [1], [0], [1], [0]],
198
- ).all()
199
-
200
- def test_stake_weight(self):
201
- assert np.isclose(
202
- stake_weight(self.up.to_frame(), pd.Series([1], index=[self.up.name])),
203
- self.up.values.T,
204
- ).all()
205
- assert np.isclose(
206
- stake_weight(
207
- pd.concat([self.up, self.down], axis=1),
208
- pd.Series([1, 1], index=[self.up.name, self.down.name]),
209
- ),
210
- ((self.up + self.down) / 2).values.T,
211
- ).all()
212
-
213
- def test_neutralize_basic(self):
214
- assert np.isclose(
215
- neutralize(self.up.to_frame(), pd.DataFrame([0, 0, 0, 0, 0])).values.T,
216
- self.up - self.up.mean(),
217
- ).all()
218
-
219
- def test_neutralize_multiple_subs(self):
220
- assert np.isclose(
221
- neutralize(self.up_down.to_frame(), self.down_up.to_frame()).values.T,
222
- [0, 0, 0, 0, 0],
223
- ).all()
224
-
225
- def test_neutralize_multiple_subs_multiple_neutralizers(self):
226
- # ensure it works for multiple submissions/neutralizers
227
- assert np.isclose(
228
- neutralize(
229
- pd.concat([self.up_down, self.up_down], axis=1),
230
- pd.concat([self.down_up, self.down_up], axis=1),
231
- ).values.T,
232
- [
233
- [0, 0, 0, 0, 0],
234
- [0, 0, 0, 0, 0],
235
- ],
236
- ).all()
237
- assert np.isclose(
238
- neutralize(
239
- pd.concat([self.up, self.down], axis=1),
240
- pd.concat(
241
- [pd.Series([0, 0, 0, 0, 0]), pd.Series([0, 0, 0, 0, 0])], axis=1
242
- ),
243
- ).values.T,
244
- pd.concat(
245
- [self.up - self.up.mean(), self.down - self.down.mean()], axis=1
246
- ).values.T,
247
- ).all()
248
-
249
- def test_neutralize_proportion(self):
250
- # Test with proportion less than 1
251
- assert np.isclose(
252
- neutralize(
253
- self.up.to_frame(), pd.DataFrame([0, 0, 0, 0, 0]), proportion=0.5
254
- ).values.T,
255
- (self.up - self.up.mean() * 0.5),
256
- ).all()
257
-
258
- # Test with proportion equal to 0
259
- assert np.isclose(
260
- neutralize(
261
- self.up.to_frame(), pd.DataFrame([0, 0, 0, 0, 0]), proportion=0
262
- ).values.T,
263
- self.up,
264
- ).all()
265
-
266
- def test_neutralize_with_nans(self):
267
- # Test with NaNs in input data
268
- up_with_nans = self.up.copy()
269
- up_with_nans[2] = np.nan
270
- self.assertRaisesRegex(
271
- AssertionError,
272
- "Data contains NaNs",
273
- neutralize,
274
- up_with_nans.to_frame(),
275
- pd.DataFrame([0, 0, 0, 0, 0]),
276
- )
277
-
278
- def test_neutralize_large_data(self):
279
- # Test with larger dataset
280
- large_data = pd.DataFrame(np.random.randn(1000, 10))
281
- neutralizers = pd.DataFrame(np.random.randn(1000, 5))
282
- neutralized = neutralize(large_data, neutralizers)
283
- assert neutralized.shape == large_data.shape
284
- assert not np.isnan(neutralized).any().any()
285
-
286
- def test_numerai_corr_doesnt_clobber_targets(self):
287
- s = [x / 4 for x in range(5)]
288
- df = pd.DataFrame({"target": s, "prediction": reversed(s)})
289
- numerai_corr(df[["prediction"]], df["target"])
290
- assert pd.Series(s).equals(df["target"]), f"{s} != {list(df['target'].values)}"
291
-
292
- def test_filter_top_bottom(self):
293
- self.assertRaises(
294
- TypeError,
295
- filter_sort_top_bottom,
296
- self.up,
297
- top_bottom=None,
298
- )
299
- np.testing.assert_allclose(
300
- filter_sort_top_bottom_concat(self.up, top_bottom=2),
301
- [0, 1, 3, 4],
302
- )
303
- top, bot = filter_sort_top_bottom(
304
- self.up,
305
- top_bottom=2,
306
- )
307
- np.testing.assert_allclose(top, [3, 4])
308
- np.testing.assert_allclose(bot, [0, 1])
309
-
310
- def test_alpha(self):
311
- s = pd.DataFrame([[1, 2, 3, 4, 5]]).T
312
- N = pd.DataFrame(
313
- [
314
- [1, 5],
315
- [2, 4],
316
- [3, 3],
317
- [4, 2],
318
- [5, 1],
319
- ]
320
- )
321
- v = pd.Series([1, 0.5, 1, 0.5, 1]).T
322
- t = pd.Series([1, 0, 1, 0, 1]).T
323
- score = alpha(s, N, v, t)
324
- np.testing.assert_allclose(score, 0.0, atol=1e-14, rtol=1e-14)
325
-
326
- def test_meta_portfolio_contribution(self):
327
- s = pd.DataFrame([[1, 2, 3, 4, 5], [1, 2, 1, 2, 1]]).T
328
- st = pd.Series([0.6, 0.4])
329
- N = pd.DataFrame(
330
- [
331
- [1, 5],
332
- [2, 4],
333
- [3, 3],
334
- [4, 2],
335
- [5, 1],
336
- ]
337
- )
338
- v = pd.Series([3, 2, 1, 2, 3]).T
339
- t = pd.Series([1.0, 2.0, 3.0, 2.0, 1.0]).T
340
- score = meta_portfolio_contribution(s, st, N, v, t)
341
- assert np.isclose(score[0], -0.04329786867021718)
342
- assert np.isclose(score[1], 0.06494680300532589)
343
-
344
-
345
- if __name__ == "__main__":
346
- unittest.main()
@@ -1,139 +0,0 @@
1
- import unittest
2
-
3
- import numpy as np
4
- import pandas as pd # type: ignore
5
-
6
- from numerai_tools.signals import (
7
- churn,
8
- turnover,
9
- calculate_max_churn_and_turnover,
10
- )
11
- from .util import (
12
- generate_fake_universe,
13
- generate_new_submission,
14
- )
15
-
16
-
17
- class TestSignals(unittest.TestCase):
18
- def setUp(self):
19
- self.up = pd.Series(list(range(5))).rename("up")
20
- self.down = pd.Series(list(reversed(range(5)))).rename("down")
21
- self.up_down = pd.Series([0, 1, 2, 1, 0]).rename("up_down")
22
- self.oscillate = pd.Series([1, 0, 1, 0, 1]).rename("oscillate")
23
- self.constant = pd.Series([1, 1, 1, 1, 1]).rename("pos_neg")
24
-
25
- def test_churn(self):
26
- assert np.isclose(churn(self.up, self.up), 0)
27
- assert np.isclose(churn(self.up, self.up_down), 1)
28
- assert np.isclose(churn(self.up, self.oscillate), 1)
29
- assert np.isclose(churn(self.up, self.down), 2)
30
- self.assertRaisesRegex(
31
- AssertionError,
32
- "s2 must have non-zero standard deviation",
33
- churn,
34
- self.up,
35
- self.constant,
36
- )
37
-
38
- def test_churn_tb(self):
39
- tmp = churn(self.up, self.up, top_bottom=2)
40
- assert np.isclose(tmp, 0), tmp
41
- tmp = churn(self.up, self.up_down, top_bottom=2)
42
- assert np.isclose(tmp, 0.5), tmp
43
- tmp = churn(self.up, self.oscillate, top_bottom=2)
44
- assert np.isclose(tmp, 0.5), tmp
45
- tmp = churn(self.up, self.down, top_bottom=2)
46
- assert np.isclose(tmp, 1), tmp
47
- tmp = churn(self.up, self.constant, top_bottom=2)
48
- assert np.isclose(tmp, 0), tmp
49
-
50
- def test_turnover(self):
51
- assert np.isclose(turnover(self.up, self.up), 0)
52
- assert np.isclose(turnover(self.up, self.up_down), 3)
53
- assert np.isclose(turnover(self.up, self.oscillate), 4.5)
54
- assert np.isclose(turnover(self.up, self.down), 6)
55
- assert np.isclose(turnover(self.up, self.constant), 3.5)
56
-
57
- def test_churn_first_submission(self):
58
- """
59
- Test that the churn function works for the first submission
60
- No exceptions should be raised, should return 1
61
- """
62
- fake_universe = generate_fake_universe("20130308")
63
- fake_submission = generate_new_submission(fake_universe)
64
- fake_neutralizers = pd.DataFrame(
65
- {
66
- "neutralizer_1": [0.1] * len(fake_universe),
67
- "neutralizer_2": [0.2] * len(fake_universe),
68
- },
69
- index=fake_universe["numerai_ticker"],
70
- )
71
- fake_sample_weights = pd.Series(
72
- [0.5] * len(fake_universe),
73
- index=fake_universe["numerai_ticker"],
74
- name="sample_weight",
75
- )
76
- churn, turnover = calculate_max_churn_and_turnover(
77
- curr_sub=fake_submission,
78
- curr_neutralizer=fake_neutralizers,
79
- curr_weight=fake_sample_weights,
80
- prev_week_subs=[],
81
- prev_neutralizers={"20240208": fake_neutralizers},
82
- prev_sample_weights={"20240208": fake_sample_weights},
83
- universe=fake_universe.set_index("numerai_ticker").sort_index(),
84
- curr_signal_col="signal",
85
- curr_ticker_col="numerai_ticker",
86
- )
87
- assert np.isclose(churn, 1)
88
- assert np.isclose(turnover, 1)
89
-
90
- def test_churn_handles_different_id_columns(self):
91
- """
92
- Test that the churn function works when
93
- previous submission has different id columns.
94
- """
95
- fake_universe = generate_fake_universe("20130308")
96
- fake_submission = generate_new_submission(fake_universe, legacy_headers=True)
97
- new_fake_universe = generate_fake_universe(
98
- date_value="20130308", ticker_col="ticker"
99
- )
100
- fake_universe["ticker"] = new_fake_universe["ticker"]
101
- prev_submission = fake_submission.copy()
102
- fake_neutralizers = pd.DataFrame(
103
- {
104
- "neutralizer_1": [0.1] * len(fake_universe),
105
- "neutralizer_2": [0.2] * len(fake_universe),
106
- },
107
- index=fake_universe["numerai_ticker"],
108
- )
109
- fake_sample_weights = pd.Series(
110
- [0.5] * len(fake_universe),
111
- index=fake_universe["numerai_ticker"],
112
- name="sample_weight",
113
- )
114
- # switch out the numerai_ticke col in-place
115
- prev_submission["numerai_ticker"] = new_fake_universe["ticker"]
116
- prev_submission.rename(columns={"numerai_ticker": "ticker"}, inplace=True)
117
- prev_neutralizers = fake_neutralizers.copy()
118
- prev_neutralizers.index = new_fake_universe["ticker"]
119
- prev_neutralizers.index.name = "ticker"
120
- prev_sample_weights = fake_sample_weights.copy()
121
- prev_sample_weights.index = new_fake_universe["ticker"]
122
- prev_sample_weights.index.name = "ticker"
123
- churn, turnover = calculate_max_churn_and_turnover(
124
- curr_sub=fake_submission,
125
- curr_neutralizer=fake_neutralizers,
126
- curr_weight=fake_sample_weights,
127
- prev_week_subs={"20240208": prev_submission},
128
- prev_neutralizers={"20240208": prev_neutralizers},
129
- prev_sample_weights={"20240208": prev_sample_weights},
130
- universe=fake_universe.set_index("numerai_ticker").sort_index(),
131
- curr_signal_col="signal",
132
- curr_ticker_col="numerai_ticker",
133
- )
134
- assert np.isclose(churn, 0)
135
- assert np.isclose(turnover, 0)
136
-
137
-
138
- if __name__ == "__main__":
139
- unittest.main()
@@ -1,498 +0,0 @@
1
- import unittest
2
- import random
3
- import string
4
- from typing import List
5
-
6
- import numpy as np
7
- import pandas as pd # type: ignore
8
-
9
- from numerai_tools.submissions import (
10
- NUMERAI_ALLOWED_ID_COLS,
11
- NUMERAI_ALLOWED_PRED_COLS,
12
- SIGNALS_ALLOWED_ID_COLS,
13
- SIGNALS_ALLOWED_PRED_COLS,
14
- CRYPTO_ALLOWED_ID_COLS,
15
- CRYPTO_ALLOWED_PRED_COLS,
16
- _validate_headers,
17
- validate_headers_numerai,
18
- validate_headers_signals,
19
- validate_headers_crypto,
20
- validate_values,
21
- _validate_ids,
22
- validate_ids_numerai,
23
- validate_ids_signals,
24
- validate_ids_crypto,
25
- clean_predictions,
26
- )
27
-
28
-
29
- class TestSubmissions(unittest.TestCase):
30
- def setUp(self):
31
- # use 9 digits for cusip handling checks
32
- self.ids = generate_ids(9, 5)
33
- self.classic_subs = [
34
- generate_submission(self.ids, id_col, pred_col)
35
- for id_col in NUMERAI_ALLOWED_ID_COLS
36
- for pred_col in NUMERAI_ALLOWED_PRED_COLS
37
- ]
38
- self.signals_subs = [
39
- generate_submission(self.ids, id_col, pred_col)
40
- for id_col in SIGNALS_ALLOWED_ID_COLS
41
- for pred_col in SIGNALS_ALLOWED_PRED_COLS
42
- ]
43
- self.crypto_subs = [
44
- generate_submission(self.ids, id_col, pred_col)
45
- for id_col in CRYPTO_ALLOWED_ID_COLS
46
- for pred_col in CRYPTO_ALLOWED_PRED_COLS
47
- ]
48
-
49
- def test_validate_headers(self):
50
- assert _validate_headers(
51
- ["test1"], ["test2"], generate_submission(self.ids, "test1", "test2")
52
- ) == ("test1", "test2")
53
-
54
- def test_validate_headers_wrong_name(self):
55
- self.assertRaisesRegex(
56
- AssertionError,
57
- "headers must be one of",
58
- _validate_headers,
59
- ["test1"],
60
- ["test2"],
61
- generate_submission(self.ids, "wrong", "test2"),
62
- )
63
- self.assertRaisesRegex(
64
- AssertionError,
65
- "headers must be one of",
66
- _validate_headers,
67
- ["test1"],
68
- ["test2"],
69
- generate_submission(self.ids, "test1", "wrong"),
70
- )
71
-
72
- def test_validate_headers_missing(self):
73
- self.assertRaisesRegex(
74
- AssertionError,
75
- "headers must be one of",
76
- _validate_headers,
77
- ["test1"],
78
- ["test2"],
79
- generate_submission(self.ids, "test1", "test2")[["test1"]],
80
- )
81
- self.assertRaisesRegex(
82
- AssertionError,
83
- "headers must be one of",
84
- _validate_headers,
85
- ["test1"],
86
- ["test2"],
87
- generate_submission(self.ids, "test1", "test2")[["test2"]],
88
- )
89
-
90
- def test_validate_headers_numerai(self):
91
- for sub in self.classic_subs:
92
- assert validate_headers_numerai(sub) == tuple(sub.columns)
93
-
94
- def test_validate_headers_numerai_wrong_name(self):
95
- for sub in self.classic_subs:
96
- self.assertRaisesRegex(
97
- AssertionError,
98
- "headers must be one of",
99
- validate_headers_numerai,
100
- sub.rename(columns={sub.columns[0]: "wrong"}),
101
- )
102
- self.assertRaisesRegex(
103
- AssertionError,
104
- "headers must be one of",
105
- validate_headers_numerai,
106
- sub.rename(columns={sub.columns[1]: "wrong"}),
107
- )
108
-
109
- def test_validate_headers_numerai_missing(self):
110
- for sub in self.classic_subs:
111
- self.assertRaisesRegex(
112
- AssertionError,
113
- "headers must be one of",
114
- validate_headers_numerai,
115
- sub[[sub.columns[0]]],
116
- )
117
- self.assertRaisesRegex(
118
- AssertionError,
119
- "headers must be one of",
120
- validate_headers_numerai,
121
- sub[[sub.columns[1]]],
122
- )
123
-
124
- def test_validate_headers_signals(self):
125
- for sub in self.signals_subs:
126
- assert validate_headers_signals(sub) == tuple(sub.columns)
127
-
128
- def test_validate_headers_signals_wrong_name(self):
129
- for sub in self.signals_subs:
130
- self.assertRaisesRegex(
131
- AssertionError,
132
- "headers must be one of",
133
- validate_headers_signals,
134
- sub.rename(columns={sub.columns[0]: "wrong"}),
135
- )
136
- self.assertRaisesRegex(
137
- AssertionError,
138
- "headers must be one of",
139
- validate_headers_signals,
140
- sub.rename(columns={sub.columns[1]: "wrong"}),
141
- )
142
-
143
- def test_validate_headers_signals_missing(self):
144
- for sub in self.signals_subs:
145
- self.assertRaisesRegex(
146
- AssertionError,
147
- "headers must be one of",
148
- validate_headers_signals,
149
- sub[[sub.columns[0]]],
150
- )
151
- self.assertRaisesRegex(
152
- AssertionError,
153
- "headers must be one of",
154
- validate_headers_signals,
155
- sub[[sub.columns[1]]],
156
- )
157
-
158
- def test_validate_headers_signals_data_type_and_date_col(self):
159
- fake_sub = generate_submission(self.ids, "ticker", "signal")
160
- fake_sub["data_type"] = "signals"
161
- fake_sub["friday_date"] = "2023-01-01"
162
- with self.assertLogs(level="WARNING") as cm:
163
- assert validate_headers_signals(fake_sub) == ("ticker", "signal")
164
- self.assertIn(
165
- "WARNING:numerai_tools.submissions:data_type column found in Signals submission. This is deprecated and will be removed in the future. "
166
- "Please remove the data_type column from your Signals submission.",
167
- cm.output[0],
168
- )
169
-
170
- def test_validate_headers_crypto(self):
171
- for sub in self.crypto_subs:
172
- assert validate_headers_crypto(sub) == tuple(sub.columns)
173
-
174
- def test_validate_headers_crypto_wrong_name(self):
175
- for sub in self.crypto_subs:
176
- self.assertRaisesRegex(
177
- AssertionError,
178
- "headers must be one of",
179
- validate_headers_crypto,
180
- sub.rename(columns={sub.columns[0]: "wrong"}),
181
- )
182
- self.assertRaisesRegex(
183
- AssertionError,
184
- "headers must be one of",
185
- validate_headers_crypto,
186
- sub.rename(columns={sub.columns[1]: "wrong"}),
187
- )
188
-
189
- def test_validate_headers_crypto_missing(self):
190
- for sub in self.crypto_subs:
191
- self.assertRaisesRegex(
192
- AssertionError,
193
- "headers must be one of",
194
- validate_headers_crypto,
195
- sub[[sub.columns[0]]],
196
- )
197
- self.assertRaisesRegex(
198
- AssertionError,
199
- "headers must be one of",
200
- validate_headers_crypto,
201
- sub[[sub.columns[1]]],
202
- )
203
-
204
- def test_validate_values(self):
205
- validate_values(generate_submission(self.ids, "id", "prediction"), "prediction")
206
-
207
- def test_validate_values_nans(self):
208
- nan_sub = generate_submission(self.ids, "id", "prediction")
209
- nan_sub.loc[0, "prediction"] = np.nan
210
- self.assertRaisesRegex(
211
- AssertionError,
212
- "must not contain NaNs",
213
- validate_values,
214
- nan_sub,
215
- "prediction",
216
- )
217
-
218
- def test_validate_values_out_of_bounds(self):
219
- out_of_bounds_sub = generate_submission(self.ids, "id", "prediction")
220
- out_of_bounds_sub.loc[0, "prediction"] = -1
221
- self.assertRaisesRegex(
222
- AssertionError,
223
- "values must be between 0 and 1 exclusive",
224
- validate_values,
225
- out_of_bounds_sub,
226
- "prediction",
227
- )
228
- out_of_bounds_sub.loc[0, "prediction"] = 2
229
- self.assertRaisesRegex(
230
- AssertionError,
231
- "values must be between 0 and 1 exclusive",
232
- validate_values,
233
- out_of_bounds_sub,
234
- "prediction",
235
- )
236
-
237
- def test_validate_values_zero_std(self):
238
- const_sub = generate_submission(self.ids, "id", "prediction")
239
- const_sub["prediction"] = 0.5
240
- self.assertRaisesRegex(
241
- AssertionError,
242
- "submission must have non-zero standard deviation",
243
- validate_values,
244
- const_sub,
245
- "prediction",
246
- )
247
-
248
- def test_validate_ids(self):
249
- sub = generate_submission(self.ids, "id", "prediction")
250
- new_sub, invalid_ids = _validate_ids(self.ids, sub, "id", len(self.ids))
251
- assert (new_sub == sub.sort_values("id")).all().all()
252
- assert invalid_ids == []
253
-
254
- def test_validate_ids_nans(self):
255
- nan_sub = generate_submission(self.ids, "id", "prediction")
256
- nan_sub.loc[0, "id"] = np.nan
257
- self.assertRaisesRegex(
258
- AssertionError,
259
- "must not contain NaNs",
260
- _validate_ids,
261
- self.ids,
262
- nan_sub,
263
- "id",
264
- len(self.ids),
265
- )
266
-
267
- def test_validate_ids_all_nan_ids(self):
268
- nan_ids = pd.Series([np.nan, np.nan, np.nan])
269
- submission = generate_submission(nan_ids, "id", "prediction")
270
- self.assertRaisesRegex(
271
- AssertionError,
272
- "Submission must not contain NaNs",
273
- _validate_ids,
274
- self.ids,
275
- submission,
276
- "id",
277
- len(self.ids),
278
- )
279
-
280
- def test_validate_ids_duplicates(self):
281
- dup_sub = generate_submission(self.ids, "id", "prediction")
282
- dup_sub.loc[0] = dup_sub.loc[1]
283
- self.assertRaisesRegex(
284
- AssertionError,
285
- "Duplicates detected",
286
- _validate_ids,
287
- self.ids,
288
- dup_sub,
289
- "id",
290
- len(self.ids),
291
- )
292
-
293
def test_validate_ids_duplicate_ids(self):
    """Appending a repeated row must trigger duplicate detection."""
    base = generate_submission(self.ids, "id", "prediction")
    # Re-append the first row so its id occurs twice in the frame.
    first_row = base.iloc[:1]
    with_repeat = pd.concat([base, first_row])
    with self.assertRaisesRegex(AssertionError, "Duplicates detected"):
        _validate_ids(self.ids, with_repeat, "id", len(self.ids))
305
-
306
def test_validate_ids_missing(self):
    """Dropping one live id must fail the minimum-count check."""
    full_sub = generate_submission(self.ids, "id", "prediction")
    # Keep every row except the one for the first live id.
    keep_mask = full_sub["id"] != self.ids[0]
    short_sub = full_sub[keep_mask]
    with self.assertRaisesRegex(AssertionError, "Not enough stocks submitted"):
        _validate_ids(self.ids, short_sub, "id", len(self.ids))
318
-
319
def test_validate_ids_empty_submission(self):
    """A frame with the right columns but no rows must fail the count check."""
    no_rows = pd.DataFrame(columns=["id", "prediction"])
    with self.assertRaisesRegex(AssertionError, "Not enough stocks submitted."):
        _validate_ids(self.ids, no_rows, "id", len(self.ids))
330
-
331
def test_validate_ids_all_invalid_ids(self):
    """A submission containing only unknown ids must fail the count check."""
    unknown_ids = pd.Series([f"invalid{k}" for k in (1, 2, 3)])
    unknown_sub = generate_submission(unknown_ids, "id", "prediction")
    with self.assertRaisesRegex(AssertionError, "Not enough stocks submitted."):
        _validate_ids(self.ids, unknown_sub, "id", len(self.ids))
343
-
344
def test_validate_ids_mixed_valid_invalid_ids(self):
    """Unknown ids are filtered out and reported; the valid ids are kept.

    The returned frame's "id" column must equal the sorted live ids, and the
    reported invalid ids must be exactly the two extras that were added.
    """
    extras = ["invalid1", "invalid2"]
    combined_ids = self.ids.tolist() + extras
    submission = generate_submission(combined_ids, "id", "prediction")
    validated, rejected = _validate_ids(
        self.ids, submission, "id", len(self.ids)
    )
    ids_match = validated["id"] == self.ids.sort_values()
    assert ids_match.all()
    assert set(rejected) == {"invalid1", "invalid2"}
350
-
351
def test_validate_ids_numerai(self):
    """validate_ids_numerai accepts a full submission and returns it sorted by id."""
    sub = generate_submission(self.ids, "id", "prediction")
    validated, rejected = validate_ids_numerai(self.ids, sub, "id")
    cells_match = validated == sub.sort_values("id")
    assert cells_match.all().all()
    assert rejected == []
356
-
357
def test_validate_ids_signals(self):
    """validate_ids_signals accepts a full 100-ticker submission, sorted by ticker."""
    # 100 random 9-character tickers serve as the live universe.
    tickers = generate_ids(9, 100)
    sub = generate_submission(tickers, "ticker", "signal")
    validated, rejected = validate_ids_signals(tickers, sub, "ticker")
    cells_match = validated == sub.sort_values("ticker")
    assert cells_match.all().all()
    assert rejected == []
363
-
364
def test_validate_ids_crypto(self):
    """validate_ids_crypto accepts a full 100-ticker submission, sorted by ticker."""
    # 100 random 9-character tickers serve as the live universe.
    tickers = generate_ids(9, 100)
    sub = generate_submission(tickers, "ticker", "signal")
    validated, rejected = validate_ids_crypto(tickers, sub, "ticker")
    cells_match = validated == sub.sort_values("ticker")
    assert cells_match.all().all()
    assert rejected == []
370
-
371
def test_clean_predictions(self):
    """Without rank-and-fill, cleaning only indexes by id and sorts.

    The cleaned output (index reset) must equal the input after it has been
    indexed by "id", sorted by that index, and reset again.
    """
    int_sub = generate_submission(self.ids, "id", "prediction", random_vals=False)
    cleaned = clean_predictions(
        self.ids,
        int_sub,
        id_col="id",
        rank_and_fill=False,
    ).reset_index()
    expected = int_sub.set_index("id").sort_index().reset_index()
    cells_match = cleaned == expected
    assert cells_match.all().all()
386
-
387
def test_clean_predictions_rank_and_fill(self):
    """With rank-and-fill, sequential predictions map to evenly spaced ranks.

    NOTE(review): the expected row has five values, so this presumably relies
    on the fixture `self.ids` holding five ids -- confirm against setUp.
    """
    int_sub = generate_submission(self.ids, "id", "prediction", random_vals=False)
    ranked = clean_predictions(
        self.ids,
        int_sub,
        id_col="id",
        rank_and_fill=True,
    )
    # Order by prediction and transpose so the values form a single row.
    ordered_row = ranked.sort_values("prediction").values.T
    expected_row = [[0.1, 0.3, 0.5, 0.7, 0.9]]
    assert np.isclose(ordered_row, expected_row).all()
400
-
401
def test_clean_predictions_empty_predictions(self):
    """Cleaning a frame with no rows must raise the empty-predictions assertion."""
    no_rows = pd.DataFrame(columns=["id", "prediction"])
    with self.assertRaisesRegex(AssertionError, "predictions must not be empty"):
        clean_predictions(
            self.ids,
            no_rows,
            id_col="id",
            rank_and_fill=False,
        )
412
-
413
def test_clean_predictions_all_nan_predictions(self):
    """With rank-and-fill, an all-NaN prediction column is filled with 0.5."""
    nan_preds = generate_submission(self.ids, "id", "prediction")
    nan_preds["prediction"] = np.nan
    cleaned = clean_predictions(
        self.ids,
        nan_preds,
        id_col="id",
        rank_and_fill=True,
    )
    all_half = cleaned == 0.5
    assert all_half.all().all()
423
-
424
def test_clean_predictions_mixed_valid_invalid_ids(self):
    """Unknown ids are dropped: the cleaned index equals the sorted live ids."""
    extras = ["invalid1", "invalid2"]
    combined_ids = self.ids.tolist() + extras
    raw_preds = generate_submission(combined_ids, "id", "prediction")
    cleaned = clean_predictions(
        self.ids,
        raw_preds,
        id_col="id",
        rank_and_fill=False,
    )
    index_matches = cleaned.index == self.ids.sort_values()
    assert index_matches.all()
434
-
435
def test_clean_predictions_duplicate_ids(self):
    """A repeated input row must not leave duplicate ids in the cleaned index."""
    base = generate_submission(self.ids, "id", "prediction")
    # Re-append the first row so its id occurs twice in the input.
    with_repeat = pd.concat([base, base.iloc[:1]])
    cleaned = clean_predictions(
        self.ids,
        with_repeat,
        id_col="id",
        rank_and_fill=False,
    )
    has_duplicates = cleaned.index.duplicated().any()
    assert not has_duplicates
445
-
446
-
447
def generate_ids(id_length: int, num_rows: int) -> pd.Series:
    """Generates a given number of unique uppercase-ASCII strings of a given length.

    Arguments:
        id_length -- integer length of each id
        num_rows -- integer number of unique ids to generate

    Return pd.Series:
        - series of `num_rows` unique strings, each `id_length` characters long
          (order is arbitrary because the ids are collected in a set)

    Raises ValueError:
        - if `num_rows` exceeds the number of distinct ids of that length
    """
    alphabet = string.ascii_uppercase
    # Only len(alphabet) ** id_length distinct ids exist. Without this guard the
    # rejection-sampling loop below would spin forever on an impossible request.
    capacity = len(alphabet) ** max(id_length, 0)
    if num_rows > capacity:
        raise ValueError(
            f"cannot generate {num_rows} unique ids of length {id_length}; "
            f"only {capacity} distinct values exist"
        )
    values: set[str] = set()
    # Keep sampling until enough distinct ids exist; the set drops collisions.
    while len(values) < num_rows:
        values.add("".join(random.choices(alphabet, k=id_length)))
    return pd.Series(list(values))
462
-
463
-
464
- def generate_submission(
465
- live_ids: List[str],
466
- id_col: str,
467
- pred_col: str,
468
- random_vals: bool = True,
469
- legacy_headers: dict = {},
470
- ) -> pd.DataFrame:
471
- """Generates a random vector with given columns and ids.
472
-
473
- Arguments:
474
- live_ids -- list of strings of ids
475
- id_col -- string name of the id column
476
- pred_col -- string name of the prediction column
477
- random -- boolean whether to generate random values or sequential
478
- legacy_headers -- dictionary of legacy headers to add to the submission
479
-
480
- Return pd.DataFrame:
481
- - submission DataFrame with the given columns and ids
482
- """
483
- rows = []
484
- for i, ticker in enumerate(live_ids):
485
- if random_vals:
486
- val = random.random()
487
- else:
488
- val = i
489
- row = {id_col: ticker, pred_col: val}
490
- for col, value in legacy_headers.items():
491
- row[col] = value
492
- rows.append(row)
493
- sub = pd.DataFrame(rows)
494
- return sub
495
-
496
-
497
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()