numerai-tools 0.5.0.dev0__tar.gz → 0.5.0.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,11 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.3
2
2
  Name: numerai-tools
3
- Version: 0.5.0.dev0
3
+ Version: 0.5.0.dev2
4
4
  Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
5
- Home-page: https://github.com/numerai/numerai-tools
6
- Maintainer: Numerai
7
- Maintainer-email: support@numer.ai
8
- License: MIT License
9
- Platform: OS Independent
5
+ License: MIT
6
+ Author: Numerai Engineering
7
+ Author-email: engineering@numer.ai
8
+ Requires-Python: >=3.11
10
9
  Classifier: Development Status :: 5 - Production/Stable
11
10
  Classifier: Environment :: Console
12
11
  Classifier: Intended Audience :: Science/Research
@@ -15,8 +14,15 @@ Classifier: Operating System :: OS Independent
15
14
  Classifier: Programming Language :: Python
16
15
  Classifier: Programming Language :: Python :: 3
17
16
  Classifier: Topic :: Scientific/Engineering
17
+ Requires-Dist: numpy (>=2.0.0,<3.0.0)
18
+ Requires-Dist: pandas (>=2.2.2,<3.0.0)
19
+ Requires-Dist: scikit-learn (>=1.5.0,<2.0.0)
20
+ Requires-Dist: scipy (>=1.13.0,<2.0.0)
21
+ Project-URL: Documentation, https://docs.numer.ai/
22
+ Project-URL: Homepage, https://numer.ai
23
+ Project-URL: Repository, https://github.com/numerai/numerai-tools
18
24
  Description-Content-Type: text/markdown
19
- License-File: LICENSE
20
25
 
21
26
  # numerai-tools
22
27
  A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
28
+
@@ -1,8 +1,8 @@
1
- from typing import List, Tuple, Union, Optional, TypeVar
1
+ from typing import List, Tuple, Union, Optional, TypeVar, cast, Any
2
2
 
3
3
  import numpy as np
4
- import pandas as pd # type: ignore
5
- from scipy import stats # type: ignore
4
+ import pandas as pd
5
+ from scipy import stats
6
6
  from sklearn.preprocessing import OneHotEncoder # type: ignore
7
7
 
8
8
 
@@ -43,13 +43,13 @@ def filter_sort_index(
43
43
  "s2 does not have enough overlapping ids with s1,"
44
44
  f" must have >= {round(1-max_filtered_ratio,2)*100}% overlapping ids"
45
45
  )
46
- return s1.loc[ids].sort_index(), s2.loc[ids].sort_index()
46
+ return cast(S1, s1.loc[ids].sort_index()), cast(S2, s2.loc[ids].sort_index())
47
47
 
48
48
 
49
49
  def filter_sort_index_many(
50
- inputs: List[pd.DataFrame],
50
+ inputs: List[Any],
51
51
  max_filtered_ratio: float = DEFAULT_MAX_FILTERED_INDEX_RATIO,
52
- ) -> List[pd.DataFrame]:
52
+ ) -> List[Any]:
53
53
  """Filters the indices of the given list of series to match each other,
54
54
  then sorts the indices, then checks that we didn't filter too many indices
55
55
  before returning the filtered and sorted series.
@@ -75,25 +75,38 @@ def filter_sort_index_many(
75
75
 
76
76
 
77
77
  def filter_sort_top_bottom(
78
- s: pd.Series, top_bottom: int, return_concatenated: bool = True
79
- ) -> Union[pd.Series, Tuple[pd.Series, pd.Series]]:
78
+ s: pd.Series, top_bottom: int
79
+ ) -> Tuple[pd.Series, pd.Series]:
80
80
  """Filters the series according to the top n and bottom n values
81
- then sorts the index and returns the filtered and sorted series.
81
+ then sorts the index and returns two filtered and sorted series
82
+ for the top and bottom values respectively.
82
83
 
83
84
  Arguments:
84
85
  s: pd.Series - the data to filter and sort
85
86
  top_bottom: int - the number of top n and bottom n values to keep
86
87
 
87
88
  Returns:
88
- pd.Series - the filtered and sorted data
89
+ Tuple[pd.Series, pd.Series] - the filtered and sorted top and bottom series respectively
89
90
  """
90
91
  tb_idx = np.argsort(s, kind="stable")
91
92
  bot = s.iloc[tb_idx[:top_bottom]]
92
93
  top = s.iloc[tb_idx[-top_bottom:]]
93
- if return_concatenated:
94
- return pd.concat([top, bot]).sort_index()
95
- else:
96
- return top.sort_index(), bot.sort_index()
94
+ return top.sort_index(), bot.sort_index()
95
+
96
+
97
+ def filter_sort_top_bottom_concat(s: pd.Series, top_bottom: int) -> pd.Series:
98
+ """Similar to filter_sort_top_bottom, but concatenates the top and bottom series
99
+ into 1 series and then sorts the index.
100
+
101
+ Arguments:
102
+ s: pd.Series - the data to filter and sort
103
+ top_bottom: int - the number of top n and bottom n values to keep
104
+
105
+ Returns:
106
+ pd.Series - the concatenated and sorted series of top and bottom values
107
+ """
108
+ top, bot = filter_sort_top_bottom(s, top_bottom)
109
+ return pd.concat([top, bot]).sort_index()
97
110
 
98
111
 
99
112
  def rank(df: pd.DataFrame, method: str = "average") -> pd.DataFrame:
@@ -134,14 +147,14 @@ def variance_normalize(df: pd.DataFrame) -> pd.DataFrame:
134
147
  return df / np.std(df, axis=0)
135
148
 
136
149
 
137
- def weight_normalize(df: pd.DataFrame) -> pd.DataFrame:
138
- """Scale a df such that all columns have absolute value sum == 1."""
139
- return df / df.abs().sum(axis=0)
150
+ def weight_normalize(s: S1) -> S1:
151
+ """Scale an input such that all columns have absolute value sum == 1."""
152
+ return cast(S1, s / s.abs().sum(axis=0))
140
153
 
141
154
 
142
- def center(df: pd.DataFrame) -> pd.DataFrame:
143
- """Shift the df such that all columns have mean == 0."""
144
- return df - df.mean()
155
+ def center(s: S1) -> S1:
156
+ """Shift the input such that all columns have mean == 0."""
157
+ return cast(S1, s - s.mean())
145
158
 
146
159
 
147
160
  def standardize(df: pd.DataFrame) -> pd.DataFrame:
@@ -180,7 +193,7 @@ def pearson_correlation(
180
193
  target: pd.Series, predictions: pd.Series, top_bottom: Optional[int] = None
181
194
  ) -> float:
182
195
  if top_bottom is not None and top_bottom > 0:
183
- predictions = filter_sort_top_bottom(predictions, top_bottom)
196
+ predictions = filter_sort_top_bottom_concat(predictions, top_bottom)
184
197
  target, predictions = filter_sort_index(
185
198
  target, predictions, (1 - top_bottom / len(target))
186
199
  )
@@ -206,7 +219,7 @@ def power(df: pd.DataFrame, p: float) -> pd.DataFrame:
206
219
  """
207
220
  assert not df.isna().any().any(), "Data contains NaNs"
208
221
  assert np.array_equal(df.index.sort_values(), df.index), "Index is not sorted"
209
- result = np.sign(df) * np.abs(df) ** p
222
+ result = cast(pd.DataFrame, np.sign(df) * np.abs(df) ** p)
210
223
  assert ((result.std() == 0) | (result.corrwith(df) >= 0.9)).all()
211
224
  return result
212
225
 
@@ -222,7 +235,7 @@ def gaussian(df: pd.DataFrame) -> pd.DataFrame:
222
235
  pd.DataFrame - the gaussianized data
223
236
  """
224
237
  assert np.array_equal(df.index.sort_values(), df.index)
225
- return df.apply(lambda series: stats.norm.ppf(series))
238
+ return df.apply(lambda series: cast(np.ndarray, stats.norm.ppf(series)))
226
239
 
227
240
 
228
241
  def orthogonalize(v: np.ndarray, u: np.ndarray) -> np.ndarray:
@@ -304,7 +317,7 @@ def correlation_contribution(
304
317
  m = gaussian(tie_kept_rank(meta_model.to_frame()))[meta_model.name].values
305
318
 
306
319
  # orthogonalize predictions wrt meta model
307
- neutral_preds = orthogonalize(p, m)
320
+ neutral_preds = orthogonalize(p, cast(np.ndarray, m))
308
321
 
309
322
  # convert target to buckets [-2, -1, 0, 1, 2]
310
323
  if (live_targets >= 0).all() and (live_targets <= 1).all():
@@ -315,9 +328,9 @@ def correlation_contribution(
315
328
  # filter each column to its top and bottom n predictions
316
329
  neutral_preds_df = pd.DataFrame(
317
330
  neutral_preds, columns=predictions.columns, index=predictions.index
318
- ).apply(lambda p: filter_sort_top_bottom(p, top_bottom))
319
- # create a dataframe for targets to match the filtered predictions
320
- live_targets = (
331
+ ).apply(lambda p: filter_sort_top_bottom_concat(p, top_bottom))
332
+ mmc_matrix = (
333
+ # create a dataframe for targets to match the filtered predictions
321
334
  neutral_preds_df.apply(
322
335
  lambda p: filter_sort_index(
323
336
  p,
@@ -327,19 +340,15 @@ def correlation_contribution(
327
340
  )
328
341
  .fillna(0)
329
342
  .T.values
330
- )
331
- # fillna with 0 so we don't get NaNs in the dot product
332
- neutral_preds = neutral_preds_df.fillna(0).values
333
-
334
- # multiply target and neutralized predictions
335
- # this is equivalent to covariance b/c mean = 0
336
- mmc = live_targets @ neutral_preds
337
- if top_bottom is not None and top_bottom > 0:
343
+ # then fill NaNs with 0 so we don't get NaNs in the dot product
344
+ # and multiply target w/ neutral preds to get MMC
345
+ ) @ neutral_preds_df.fillna(0).values
338
346
  # only the diagonal is the proper score
339
- mmc = np.diag(mmc) / (top_bottom * 2)
347
+ mmc = np.diag(mmc_matrix) / (top_bottom * 2)
340
348
  else:
341
- mmc /= len(live_targets)
342
-
349
+ # multiply target and neutralized predictions
350
+ # this is equivalent to covariance b/c mean = 0
351
+ mmc = (live_targets @ neutral_preds) / len(live_targets)
343
352
  return pd.Series(mmc, index=predictions.columns)
344
353
 
345
354
 
@@ -523,10 +532,10 @@ def max_feature_correlation(
523
532
  feature_correlations = features.apply(
524
533
  lambda f: pearson_correlation(f, s, top_bottom)
525
534
  )
526
- feature_correlations = np.abs(feature_correlations)
535
+ feature_correlations = feature_correlations.abs()
527
536
  max_feature = feature_correlations.idxmax()
528
537
  max_corr = feature_correlations[max_feature]
529
- return max_feature, max_corr
538
+ return str(max_feature), max_corr
530
539
 
531
540
 
532
541
  def generate_neutralized_weights(
@@ -609,9 +618,9 @@ def meta_portfolio_contribution(
609
618
  s_prime, neutralizers, sample_weights
610
619
  )
611
620
  )
612
- w = weights[stakes.index].values
613
- s = stake_weights.values
614
- t = targets.values
621
+ w = cast(np.ndarray, weights[stakes.index].values)
622
+ s = cast(np.ndarray, stake_weights.values)
623
+ t = cast(np.ndarray, targets.values)
615
624
  swp = w @ s
616
625
  swp = swp - swp.mean()
617
626
  l1_norm = np.sum(np.abs(swp))
@@ -40,8 +40,8 @@ def churn(
40
40
  float - the churn between the two series
41
41
  """
42
42
  if top_bottom is not None and top_bottom > 0:
43
- s1_top, s1_bot = filter_sort_top_bottom(s1, top_bottom, False)
44
- s2_top, s2_bot = filter_sort_top_bottom(s2, top_bottom, False)
43
+ s1_top, s1_bot = filter_sort_top_bottom(s1, top_bottom)
44
+ s2_top, s2_bot = filter_sort_top_bottom(s2, top_bottom)
45
45
  top_overlap = len(s1_top.index.intersection(s2_top.index)) / top_bottom
46
46
  bot_overlap = len(s1_bot.index.intersection(s2_bot.index)) / top_bottom
47
47
  avg_overlap = (top_overlap + bot_overlap) / 2
@@ -85,11 +85,13 @@ def neutral_weight(
85
85
  neutralizer: pd.DataFrame,
86
86
  weight: pd.Series,
87
87
  ) -> pd.Series:
88
- s_prime = tie_kept_rank__gaussianize__pow_1_5(submission.to_frame())[signal_col]
89
- s_prime, neutralizer, weight = filter_sort_index_many( # type: ignore
88
+ s_prime = tie_kept_rank__gaussianize__pow_1_5(submission.to_frame())
89
+ s_prime, neutralizer, weight = filter_sort_index_many(
90
90
  [s_prime, neutralizer, weight]
91
91
  )
92
- neutral_weights = generate_neutralized_weights(s_prime, neutralizer, weight)
92
+ neutral_weights = generate_neutralized_weights(
93
+ s_prime[signal_col], neutralizer, weight
94
+ )
93
95
  neutral_weights = weight_normalize(center(neutral_weights.to_frame()))[0]
94
96
  return neutral_weights.sort_index()
95
97
 
@@ -161,10 +163,10 @@ def calculate_max_churn_and_turnover(
161
163
  prev_sub = prev_week_subs[datestamp]
162
164
  prev_neutralizer = prev_neutralizers[datestamp]
163
165
  prev_weight = prev_sample_weights[datestamp]
164
- prev_ticker_col, prev_signal_col = validate_headers_signals(prev_sub) # type: ignore
166
+ prev_ticker_col, prev_signal_col = validate_headers_signals(prev_sub)
165
167
  prev_universe = universe.reset_index().set_index(prev_ticker_col)
166
168
  filtered_prev_sub_df, _ = validate_ids_signals(
167
- prev_universe.index, prev_sub, prev_ticker_col
169
+ prev_universe.index.to_series(), prev_sub, prev_ticker_col
168
170
  )
169
171
  # in case the previous submission has a different ticker column,
170
172
  # remap the ticker column of prev data to the current ticker column
@@ -170,8 +170,7 @@ def clean_predictions(
170
170
  predictions: pd.DataFrame,
171
171
  id_col: str,
172
172
  rank_and_fill: bool,
173
- left_join_on_ids: bool = False,
174
- ) -> pd.Series:
173
+ ) -> pd.DataFrame:
175
174
  """Prepare predictions for submission to Numerai.
176
175
  Filters out ids not in live data, drops duplicates, sets ids as index,
177
176
  then optionally ranks (keeping ties) and fills NaNs with 0.5.
@@ -0,0 +1,45 @@
1
+ [project]
2
+ name = "numerai-tools"
3
+ version = "0.5.0.dev2"
4
+ description = "A collection of open-source tools to help interact with Numerai, model data, and automate submissions."
5
+ authors = [
6
+ {name = "Numerai Engineering",email = "engineering@numer.ai"}
7
+ ]
8
+ license = {text = "MIT"}
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ classifiers = [
12
+ "Development Status :: 5 - Production/Stable",
13
+ "Environment :: Console",
14
+ "Intended Audience :: Science/Research",
15
+ "License :: OSI Approved :: MIT License",
16
+ "Operating System :: OS Independent",
17
+ "Programming Language :: Python",
18
+ "Programming Language :: Python :: 3",
19
+ "Topic :: Scientific/Engineering",
20
+ ]
21
+
22
+ [project.urls]
23
+ homepage = "https://numer.ai"
24
+ repository = "https://github.com/numerai/numerai-tools"
25
+ documentation = "https://docs.numer.ai/"
26
+
27
+ [tool.poetry]
28
+ packages = [
29
+ {include = "numerai_tools", from = "."},
30
+ ]
31
+
32
+ [tool.poetry.dependencies]
33
+ pandas = "^2.2.2"
34
+ numpy = "^2.0.0"
35
+ scipy = "^1.13.0"
36
+ scikit-learn = "^1.5.0"
37
+
38
+ [tool.poetry.group.dev.dependencies]
39
+ pytest = "^8.3.4"
40
+ mypy = "^1.15.0"
41
+ ruff = "^0.5.4"
42
+
43
+ [build-system]
44
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
45
+ build-backend = "poetry.core.masonry.api"
@@ -1,22 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: numerai_tools
3
- Version: 0.5.0.dev0
4
- Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
5
- Home-page: https://github.com/numerai/numerai-tools
6
- Maintainer: Numerai
7
- Maintainer-email: support@numer.ai
8
- License: MIT License
9
- Platform: OS Independent
10
- Classifier: Development Status :: 5 - Production/Stable
11
- Classifier: Environment :: Console
12
- Classifier: Intended Audience :: Science/Research
13
- Classifier: License :: OSI Approved :: MIT License
14
- Classifier: Operating System :: OS Independent
15
- Classifier: Programming Language :: Python
16
- Classifier: Programming Language :: Python :: 3
17
- Classifier: Topic :: Scientific/Engineering
18
- Description-Content-Type: text/markdown
19
- License-File: LICENSE
20
-
21
- # numerai-tools
22
- A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
@@ -1,16 +0,0 @@
1
- LICENSE
2
- README.md
3
- setup.py
4
- numerai_tools/__init__.py
5
- numerai_tools/py.typed
6
- numerai_tools/scoring.py
7
- numerai_tools/signals.py
8
- numerai_tools/submissions.py
9
- numerai_tools.egg-info/PKG-INFO
10
- numerai_tools.egg-info/SOURCES.txt
11
- numerai_tools.egg-info/dependency_links.txt
12
- numerai_tools.egg-info/requires.txt
13
- numerai_tools.egg-info/top_level.txt
14
- tests/test_scoring.py
15
- tests/test_signals.py
16
- tests/test_submissions.py
@@ -1,4 +0,0 @@
1
- pandas<3.0.0,>=2.2.2
2
- numpy<3.0.0,>=2.0.0
3
- scipy<2.0.0,>=1.13.0
4
- scikit-learn<2.0.0,>=1.5.0
@@ -1 +0,0 @@
1
- numerai_tools
@@ -1,4 +0,0 @@
1
- [egg_info]
2
- tag_build =
3
- tag_date = 0
4
-
@@ -1,47 +0,0 @@
1
- from setuptools import setup
2
- from setuptools import find_packages
3
-
4
- VERSION = "0.5.0.dev0"
5
-
6
-
7
- def load(path):
8
- return open(path, "r").read()
9
-
10
-
11
- classifiers = [
12
- "Development Status :: 5 - Production/Stable",
13
- "Environment :: Console",
14
- "Intended Audience :: Science/Research",
15
- "License :: OSI Approved :: MIT License",
16
- "Operating System :: OS Independent",
17
- "Programming Language :: Python",
18
- "Programming Language :: Python :: 3",
19
- "Topic :: Scientific/Engineering",
20
- ]
21
-
22
-
23
- if __name__ == "__main__":
24
- setup(
25
- name="numerai_tools",
26
- version=VERSION,
27
- maintainer="Numerai",
28
- maintainer_email="support@numer.ai",
29
- description="A collection of open-source tools to help interact with Numerai, model data, and automate submissions.",
30
- long_description=load("README.md"),
31
- long_description_content_type="text/markdown",
32
- url="https://github.com/numerai/numerai-tools",
33
- platforms="OS Independent",
34
- classifiers=classifiers,
35
- license="MIT License",
36
- package_data={
37
- "numerai_tools": ["LICENSE", "README.md", "py.typed"],
38
- },
39
- packages=find_packages(exclude=["tests"]),
40
- install_requires=[
41
- # pandas 2.2.2 was the first version to support numpy 2
42
- "pandas>=2.2.2,<3.0.0",
43
- "numpy>=2.0.0,<3.0.0",
44
- "scipy>=1.13.0,<2.0.0",
45
- "scikit-learn>=1.5.0,<2.0.0",
46
- ],
47
- )