PyPI - numerai-tools - Versions diffs - 0.0.1__tar.gz - Mend

numerai-tools 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

numerai_tools-0.0.1/LICENSE +21 -0
numerai_tools-0.0.1/PKG-INFO +21 -0
numerai_tools-0.0.1/README.md +2 -0
numerai_tools-0.0.1/numerai_tools/__init__.py +0 -0
numerai_tools-0.0.1/numerai_tools/scoring.py +228 -0
numerai_tools-0.0.1/numerai_tools.egg-info/PKG-INFO +21 -0
numerai_tools-0.0.1/numerai_tools.egg-info/SOURCES.txt +11 -0
numerai_tools-0.0.1/numerai_tools.egg-info/dependency_links.txt +1 -0
numerai_tools-0.0.1/numerai_tools.egg-info/requires.txt +4 -0
numerai_tools-0.0.1/numerai_tools.egg-info/top_level.txt +1 -0
numerai_tools-0.0.1/setup.cfg +4 -0
numerai_tools-0.0.1/setup.py +45 -0
numerai_tools-0.0.1/tests/test_scoring.py +142 -0

numerai_tools-0.0.1/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2023 Numerai
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

numerai_tools-0.0.1/PKG-INFO ADDED Viewed

@@ -0,0 +1,21 @@
+Metadata-Version: 2.1
+Name: numerai_tools
+Version: 0.0.1
+Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
+Home-page: https://github.com/numerai/numerai-tools
+Maintainer: Numerai
+Maintainer-email: support@numer.ai
+License: MIT License
+Description: # numerai-tools
+        A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
+Platform: OS Independent
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Scientific/Engineering
+Description-Content-Type: text/markdown

numerai_tools-0.0.1/README.md ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ # numerai-tools
2	+ A collection of open-source tools to help interact with Numerai, model data, and automate submissions.

numerai_tools-0.0.1/numerai_tools/__init__.py ADDED Viewed

File without changes

numerai_tools-0.0.1/numerai_tools/scoring.py ADDED Viewed

@@ -0,0 +1,228 @@
+from typing import List, Tuple, Union
+import numpy as np
+import pandas as pd
+from scipy import stats
+from sklearn.preprocessing import OneHotEncoder
+# Round 326 contained too many stocks, so unnecessary ids are filtered out
+# here just in case; it is also a convenient way to guarantee that both
+# outputs come back sorted and aligned on the same index.
+def filter_sort_index(
+    s1: Union[pd.DataFrame, pd.Series],
+    s2: Union[pd.DataFrame, pd.Series],
+    max_filtered_ratio: float = 0.2,
+) -> Tuple[pd.DataFrame, pd.DataFrame]:
+    """Restrict both inputs to the ids they share after dropping NaNs, then sort.
+    Arguments:
+        s1: pd.DataFrame or pd.Series - first object to align
+        s2: pd.DataFrame or pd.Series - second object to align
+        max_filtered_ratio: float - largest fraction of either input's rows
+            that may be discarded before an AssertionError is raised
+    Returns:
+        tuple - s1 and s2 limited to the shared ids, each sorted by index
+    """
+    left_ids = s1.dropna().index
+    right_ids = s2.dropna().index
+    shared_ids = left_ids.intersection(right_ids)
+    # ensure we didn't filter too many ids
+    min_kept_ratio = 1 - max_filtered_ratio
+    assert len(shared_ids) / len(s1) >= min_kept_ratio
+    assert len(shared_ids) / len(s2) >= min_kept_ratio
+    first = s1.loc[shared_ids].sort_index()
+    second = s2.loc[shared_ids].sort_index()
+    return first, second
+def rank(df: pd.DataFrame, method: str = 'average') -> pd.DataFrame:
+    """Convert every column to percentile ranks centered around 0.5.
+    Each value becomes (rank - 0.5) / count, where count is the number of
+    non-NaN entries in its column.
+    Arguments:
+        df: pd.DataFrame - the data to rank; its index must already be sorted
+        method: str - the pandas ranking method to use, options:
+            'average' (default) - keeps ties
+            'first' - breaks ties by index
+    Returns:
+        pd.DataFrame - the ranked DataFrame
+    """
+    assert np.array_equal(df.index.sort_values(), df.index), "unsorted index found"
+    def _percentile(column):
+        ordinal = column.rank(method=method).values
+        return (ordinal - 0.5) / column.count()
+    return df.apply(_percentile)
+def tie_broken_rank(df: pd.DataFrame) -> pd.DataFrame:
+    """Percentile rank each column, breaking ties by index."""
+    return rank(df, method="first")
+def tie_kept_rank(df: pd.DataFrame) -> pd.DataFrame:
+    """Percentile rank each column, leaving tied values with a shared rank."""
+    return rank(df, method="average")
+def min_max_normalize(s: pd.Series) -> pd.Series:
+    """Linearly rescale a series so its minimum maps to 0 and its maximum to 1."""
+    lowest = s.min()
+    span = s.max() - lowest
+    return (s - lowest) / span
+def validate_indices(live_targets: pd.Series, predictions: pd.Series) -> None:
+    """Assert that both series share one sorted index and hold no NaNs.
+    Arguments:
+        live_targets: pd.Series - the targets to check
+        predictions: pd.Series - the predictions to check
+    Raises:
+        AssertionError - if the indices differ, are unsorted, or a NaN is found
+    """
+    sorted_target_ids = live_targets.index.sort_values()
+    # ensure the ids are equivalent and sorted
+    assert np.array_equal(predictions.index, sorted_target_ids)
+    assert np.array_equal(live_targets.index, sorted_target_ids)
+    assert np.array_equal(predictions.index, predictions.index.sort_values())
+    # ensure no nans
+    has_nan_predictions = predictions.isna().any()
+    assert not has_nan_predictions
+    has_nan_targets = live_targets.isna().any()
+    assert not has_nan_targets
+def correlation(live_targets: pd.Series, predictions: pd.Series) -> float:
+    """Return the correlation coefficient between targets and predictions.
+    Both series are first checked with validate_indices.
+    """
+    validate_indices(live_targets, predictions)
+    # entry [0, 1] of the 2x2 coefficient matrix is targets vs predictions
+    coefficient_matrix = np.corrcoef(live_targets, predictions)
+    return coefficient_matrix[0, 1]
+def tie_broken_rank_correlation(
+    live_targets: pd.Series, predictions: pd.Series
+) -> float:
+    """Correlate live_targets with the tie-broken percentile rank of predictions.
+    Only the predictions are ranked; live_targets are used as given.
+    Arguments:
+        live_targets: pd.Series - the targets to correlate against
+        predictions: pd.Series - the predictions to rank and evaluate
+    Returns:
+        float - correlation between live_targets and the ranked predictions
+    """
+    ranked_frame = tie_broken_rank(predictions.to_frame())
+    # select the single column by position: to_frame() labels the column 0
+    # when the series has no name, so looking it up by predictions.name
+    # (None) would raise a KeyError
+    ranked_predictions = ranked_frame.iloc[:, 0]
+    return correlation(live_targets, ranked_predictions)
+def spearman_correlation(live_targets: pd.Series, predictions: pd.Series) -> float:
+    """Return the Spearman correlation between targets and predictions.
+    Both series are first checked with validate_indices.
+    """
+    validate_indices(live_targets, predictions)
+    spearman = live_targets.corr(predictions, method="spearman")
+    return spearman
+def pearson_correlation(live_targets: pd.Series, predictions: pd.Series) -> float:
+    """Return the Pearson correlation between targets and predictions.
+    Both series are first checked with validate_indices.
+    """
+    validate_indices(live_targets, predictions)
+    pearson = live_targets.corr(predictions, method="pearson")
+    return pearson
+def power(df: pd.DataFrame, p: float) -> pd.DataFrame:
+    """Raise the magnitude of every value to the power p, keeping its sign.
+    Arguments:
+        df: pd.DataFrame - the data to raise to the given power
+        p: float - the power to which we exponentiate the data
+    Returns:
+        pd.DataFrame - the data raised to the given power; every column is
+            asserted to be constant or at least 90% correlated with the
+            original column
+    """
+    assert not df.isna().any().any(), "Data contains NaNs"
+    assert np.array_equal(df.index.sort_values(), df.index), "Index is not sorted"
+    magnitudes = np.abs(df) ** p
+    result = np.sign(df) * magnitudes
+    is_constant = result.std() == 0
+    is_correlated = result.corrwith(df) >= 0.9
+    assert (is_constant | is_correlated).all()
+    return result
+def gaussian(df: pd.DataFrame) -> pd.DataFrame:
+    """Map each column through the normal distribution's percent point function.
+    Arguments:
+        df: pd.DataFrame - the data to gaussianize; its index must be sorted
+    Returns:
+        pd.DataFrame - the gaussianized data
+    """
+    assert np.array_equal(df.index.sort_values(), df.index)
+    def _normal_quantile(series):
+        return stats.norm.ppf(series)
+    return df.apply(_normal_quantile)
+def neutralize(
+    df: pd.DataFrame, neutralizers: pd.DataFrame, proportion: float = 1.0
+) -> pd.DataFrame:
+    """Neutralize each column of a given DataFrame by each feature in a given
+    neutralizers DataFrame.
+    The projection of every column onto the neutralizers is subtracted
+    (scaled by proportion) and the result is divided by its standard
+    deviation. Columns of df with zero standard deviation come back as NaN.
+    The caller's df is left untouched.
+    Arguments:
+        df: pd.DataFrame - the data with columns to neutralize
+        neutralizers: pd.DataFrame - the neutralizer data with features as columns
+        proportion: float - the degree to which neutralization occurs
+    Returns:
+        pd.DataFrame - the neutralized data
+    """
+    assert not neutralizers.isna().any().any(), "Neutralizers contain NaNs"
+    assert len(df.index) == len(neutralizers.index), "Indices don't match"
+    assert (df.index == neutralizers.index).all(), "Indices don't match"
+    # work on a copy so blanking the constant columns below does not
+    # overwrite the data the caller passed in
+    df = df.copy()
+    df[df.columns[df.std() == 0]] = np.nan
+    df_arr = df.values
+    neutralizer_arr = neutralizers.values
+    # least-squares projection of df onto the neutralizers via pseudo-inverse
+    inverse_neutralizers = np.linalg.pinv(neutralizer_arr, rcond=1e-6)
+    adjustments = proportion * neutralizer_arr.dot(inverse_neutralizers.dot(df_arr))
+    neutral = df_arr - adjustments
+    neutral /= np.std(neutral, axis=0)
+    return pd.DataFrame(neutral, index=df.index, columns=df.columns)
+def one_hot_encode(
+    df: pd.DataFrame, columns: List[str], dtype: type = np.float64
+) -> pd.DataFrame:
+    """One-hot encodes specified columns in a pandas dataframe.
+    Each column i should have x_i discrete values (eg. categories, bucket values, etc.)
+    and will be converted to x_i columns that each have 0s for rows that don't have
+    the associated value and 1s for rows that do have that value.
+    The generated columns are labelled with the source column name followed by
+    the category value, so encoding several columns cannot produce clashing labels.
+    Arguments:
+        df: pd.DataFrame - the data with columns to one-hot encode
+        columns: List[str] - list of columns names to replace w/ one-hot encoding
+        dtype: type = np.float64 - the target datatype for the resulting columns
+    Returns:
+        pd.DataFrame - original data, but specified cols replaced w/ one-hot encoding
+    """
+    for col in columns:
+        encoder = OneHotEncoder(dtype=dtype)
+        one_hot = encoder.fit_transform(df[[col]])
+        # get_feature_names() was removed in scikit-learn 1.2 in favour of
+        # get_feature_names_out(); use whichever this version provides.
+        # Passing the column name avoids the generic "x0_*" labels, which
+        # are identical for every column and make the join below fail as
+        # soon as a second column is encoded.
+        if hasattr(encoder, "get_feature_names_out"):
+            feature_names = encoder.get_feature_names_out([str(col)])
+        else:
+            feature_names = encoder.get_feature_names([str(col)])
+        one_hot = pd.DataFrame(
+            one_hot.toarray(),
+            columns=feature_names,
+            index=df.index,
+        )
+        df = df.join(one_hot).drop(columns=col)
+    return df
+def tie_kept_rank__gaussianize__pow_1_5(df: pd.DataFrame) -> pd.DataFrame:
+    """Apply tie-kept ranking, gaussianization and a 1.5 power, in that order.
+    Arguments:
+        df: pd.DataFrame - the data to transform
+    Returns:
+        pd.DataFrame - the resulting data after applying the 3 functions
+    """
+    ranked = tie_kept_rank(df)
+    gaussianized = gaussian(ranked)
+    return power(gaussianized, 1.5)
+def tie_kept_rank__gaussianize__neutralize(
+    df: pd.DataFrame, neutralizers: pd.DataFrame
+) -> pd.DataFrame:
+    """Apply tie-kept ranking, gaussianization and neutralization, in that order.
+    Arguments:
+        df: pd.DataFrame - the data to transform
+        neutralizers: pd.DataFrame - the data the transformed df is neutralized to
+    Returns:
+        pd.DataFrame - the resulting data after applying the 3 functions
+    """
+    ranked = tie_kept_rank(df)
+    gaussianized = gaussian(ranked)
+    return neutralize(gaussianized, neutralizers)
+def numerai_corr(predictions: pd.DataFrame, targets: pd.Series) -> pd.Series:
+    """Recenter the target on 0, filter and sort indices, apply tie_kept_rank__gaussianize__pow_1_5
+    to the predictions, raise the targets to the 1.5 power, then calculate the
+    pearson correlation between the predictions and targets.
+    The targets passed in by the caller are not modified.
+    Arguments:
+        predictions: pd.DataFrame - the predictions to evaluate
+        targets: pd.Series - the live targets to evaluate against
+    Returns:
+        pd.Series - the resulting correlation scores for each column in predictions
+    """
+    # build a new centered series rather than using `-=`, which would
+    # overwrite the caller's targets in place
+    targets = targets - targets.mean()
+    targets, predictions = filter_sort_index(targets, predictions)
+    predictions = tie_kept_rank__gaussianize__pow_1_5(predictions)
+    # select the single column by position so that a Series without a name
+    # (whose frame column is labelled 0, not None) is handled as well
+    targets = power(targets.to_frame(), 1.5).iloc[:, 0]
+    scores = predictions.apply(lambda sub: pearson_correlation(targets, sub))
+    return scores

numerai_tools-0.0.1/numerai_tools.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,21 @@
+Metadata-Version: 2.1
+Name: numerai-tools
+Version: 0.0.1
+Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
+Home-page: https://github.com/numerai/numerai-tools
+Maintainer: Numerai
+Maintainer-email: support@numer.ai
+License: MIT License
+Description: # numerai-tools
+        A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
+Platform: OS Independent
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Scientific/Engineering
+Description-Content-Type: text/markdown

numerai_tools-0.0.1/numerai_tools.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,11 @@
+LICENSE
+README.md
+setup.py
+numerai_tools/__init__.py
+numerai_tools/scoring.py
+numerai_tools.egg-info/PKG-INFO
+numerai_tools.egg-info/SOURCES.txt
+numerai_tools.egg-info/dependency_links.txt
+numerai_tools.egg-info/requires.txt
+numerai_tools.egg-info/top_level.txt
+tests/test_scoring.py

numerai_tools-0.0.1/numerai_tools.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

numerai_tools-0.0.1/numerai_tools.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,4 @@
+pandas==1.2.4
+numpy==1.20.3
+scipy==1.2.1
+sklearn==0.0

numerai_tools-0.0.1/numerai_tools.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ numerai_tools

numerai_tools-0.0.1/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

numerai_tools-0.0.1/setup.py ADDED Viewed

@@ -0,0 +1,45 @@
+from setuptools import setup
+from setuptools import find_packages
+# Release version string handed to setup(version=...).
+VERSION = '0.0.1'
+def load(path):
+    """Return the full text content of the file at path.
+    The file is decoded as UTF-8 and closed before returning.
+    """
+    # the context manager closes the handle, which a bare open().read() never does
+    with open(path, 'r', encoding='utf-8') as handle:
+        return handle.read()
+# Trove classifiers published with the package metadata.
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Environment :: Console",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Topic :: Scientific/Engineering",
+]
+if __name__ == "__main__":
+    setup(
+        name="numerai_tools",
+        version=VERSION,
+        maintainer="Numerai",
+        maintainer_email="support@numer.ai",
+        description="A collection of open-source tools to help interact with Numerai, model data, and automate submissions.",
+        long_description=load('README.md'),
+        long_description_content_type='text/markdown',
+        url='https://github.com/numerai/numerai-tools',
+        platforms="OS Independent",
+        classifiers=classifiers,
+        license='MIT License',
+        # package_data is keyed by package name; the package shipped here
+        # is numerai_tools, there is no package called "numerai"
+        package_data={'numerai_tools': ['LICENSE', 'README.md']},
+        packages=find_packages(exclude=['tests']),
+        install_requires=[
+            "pandas==1.2.4",
+            "numpy==1.20.3",
+            "scipy==1.2.1",
+            # the sklearn.preprocessing module used by scoring.py is
+            # distributed as "scikit-learn"; the "sklearn" name on PyPI is
+            # a deprecated placeholder
+            "scikit-learn",
+        ],
+    )

numerai_tools-0.0.1/tests/test_scoring.py ADDED Viewed

@@ -0,0 +1,142 @@
+import unittest
+import numpy as np
+import pandas as pd
+from numerai_tools.scoring import (
+    correlation,
+    tie_broken_rank_correlation,
+    spearman_correlation,
+    pearson_correlation,
+    tie_broken_rank,
+    tie_kept_rank,
+    gaussian,
+    neutralize,
+    one_hot_encode,
+    power,
+    tie_kept_rank__gaussianize__pow_1_5,
+)
+class TestScoring(unittest.TestCase):
+    """Checks the numerai_tools.scoring helpers against small hand-built series."""
+    def setUp(self):
+        """Build the fixture series shared by the tests."""
+        print(f'\n running {type(self).__name__}')
+        # 0..4 ascending and 4..0 descending
+        self.up = pd.Series(list(range(5))).rename('up')
+        self.down = pd.Series(list(reversed(range(5)))).rename('down')
+        # alternating 1/0 pattern and its complement
+        self.up_down = pd.Series([1, 0, 1, 0, 1]).rename('up_down')
+        self.down_up = (1 - self.up_down).rename('down_up')
+        # `up` divided by its maximum, i.e. scaled into [0, 1]
+        self.up_float = (self.up / self.up.max()).rename('up_float')
+        # pairs of values with equal magnitude and opposite sign
+        self.pos_neg = pd.Series([0, -0, 0.5, -0.5, 1.0, -1.0, 2.0, -2.0]).rename(
+            'pos_neg'
+        )
+    def test_correlation(self):
+        """correlation gives 1, -1 and 0 for the matching, reversed and alternating series."""
+        assert np.isclose(correlation(self.up, self.up), 1)
+        assert np.isclose(correlation(self.up, self.down), -1)
+        assert np.isclose(correlation(self.up, self.up_down), 0)
+        assert np.isclose(correlation(self.up, self.down_up), 0)
+    def test_tie_broken_rank_correlation(self):
+        """tie_broken_rank_correlation depends on which argument holds the ties."""
+        assert np.isclose(tie_broken_rank_correlation(self.up, self.up), 1)
+        assert np.isclose(tie_broken_rank_correlation(self.up, self.down), -1)
+        # tie_broken_rank_correlation ranks the submission not the targets
+        assert np.isclose(tie_broken_rank_correlation(self.up, self.up_down), 0.5)
+        assert np.isclose(tie_broken_rank_correlation(self.up, self.down_up), 0.5)
+        assert np.isclose(tie_broken_rank_correlation(self.up_down, self.up), 0)
+        assert np.isclose(tie_broken_rank_correlation(self.down_up, self.up), 0)
+    def test_spearman_correlation(self):
+        """spearman_correlation gives 1, -1 and 0 for the fixture pairs, in either argument order."""
+        assert np.isclose(spearman_correlation(self.up, self.up), 1)
+        assert np.isclose(spearman_correlation(self.up, self.down), -1)
+        assert np.isclose(spearman_correlation(self.up, self.up_down), 0)
+        assert np.isclose(spearman_correlation(self.up, self.down_up), 0)
+        assert np.isclose(spearman_correlation(self.up_down, self.up), 0)
+        assert np.isclose(spearman_correlation(self.down_up, self.up), 0)
+    def test_pearson_correlation(self):
+        """pearson_correlation gives 1, -1 and 0 for the fixture pairs, in either argument order."""
+        assert np.isclose(pearson_correlation(self.up, self.up), 1)
+        assert np.isclose(pearson_correlation(self.up, self.down), -1)
+        assert np.isclose(pearson_correlation(self.up, self.up_down), 0)
+        assert np.isclose(pearson_correlation(self.up, self.down_up), 0)
+        assert np.isclose(pearson_correlation(self.up_down, self.up), 0)
+        assert np.isclose(pearson_correlation(self.down_up, self.up), 0)
+    def test_tie_broken_rank(self):
+        """tie_broken_rank gives every row a distinct rank, even for tied values."""
+        assert np.isclose(
+            tie_broken_rank(self.up.to_frame()).T, [0.1, 0.3, 0.5, 0.7, 0.9]
+        ).all()
+        assert np.isclose(
+            tie_broken_rank(self.up_down.to_frame()).T, [0.5, 0.1, 0.7, 0.3, 0.9]
+        ).all()
+    def test_tie_kept_rank(self):
+        """tie_kept_rank gives tied values the same rank."""
+        assert np.isclose(
+            tie_kept_rank(self.up.to_frame()).T, [0.1, 0.3, 0.5, 0.7, 0.9]
+        ).all()
+        assert np.isclose(
+            tie_kept_rank(self.up_down.to_frame()).T, [0.7, 0.2, 0.7, 0.2, 0.7]
+        ).all()
+    def test_gaussian(self):
+        """gaussian maps 0 and 1 to -inf and +inf and 0.5 to 0."""
+        # NOTE: a Series is passed here, not a DataFrame
+        assert np.isclose(
+            gaussian(self.up_float).values.T,
+            [-np.inf, -0.6744897501960817, 0, 0.6744897501960817, np.inf],
+        ).all()
+    def test_neutralize(self):
+        """neutralize returns up_down divided by its standard deviation when neutralized to down_up."""
+        reciprocal_std_dev = 1 / self.up_down.values.std()
+        assert np.isclose(
+            neutralize(self.up_down.to_frame(), self.down_up.to_frame()).values.T,
+            [reciprocal_std_dev, 0, reciprocal_std_dev, 0, reciprocal_std_dev],
+        ).all()
+        # ensure it works for multiple submissions/neutralizers
+        assert np.isclose(
+            neutralize(
+                pd.concat([self.up_down, self.up_down], axis=1),
+                pd.concat([self.down_up, self.down_up], axis=1),
+            ).values.T,
+            [
+                [reciprocal_std_dev, 0, reciprocal_std_dev, 0, reciprocal_std_dev],
+                [reciprocal_std_dev, 0, reciprocal_std_dev, 0, reciprocal_std_dev],
+            ],
+        ).all()
+    def test_one_hot_encode(self):
+        """one_hot_encode turns five distinct values into a 5x5 identity matrix."""
+        assert np.isclose(
+            one_hot_encode(self.up.to_frame(), ['up']).values.T,
+            [
+                [1.0, 0.0, 0.0, 0.0, 0.0],
+                [0.0, 1.0, 0.0, 0.0, 0.0],
+                [0.0, 0.0, 1.0, 0.0, 0.0],
+                [0.0, 0.0, 0.0, 1.0, 0.0],
+                [0.0, 0.0, 0.0, 0.0, 1.0],
+            ],
+        ).all()
+    def test_power(self):
+        """power raises magnitudes to 1.5 and keeps the sign of each value."""
+        assert np.isclose(
+            power(self.pos_neg.to_frame(), 1.5),
+            [
+                [0.0],
+                [0.0],
+                [0.3535533905932738],
+                [-0.3535533905932738],
+                [1.0000000000000000],
+                [-1.0000000000000000],
+                [2.8284271247461903],
+                [-2.8284271247461903],
+            ],
+        ).all()
+    def test_tie_kept_rank__gaussianize__pow_1_5(self):
+        """The combined rank/gaussianize/power pipeline matches the expected values for up_float."""
+        assert np.isclose(
+            tie_kept_rank__gaussianize__pow_1_5(self.up_float.to_frame()),
+            [
+                [-1.4507885796854221],
+                [-0.3797472709071263],
+                [0.0000000000000000],
+                [0.3797472709071261],
+                [1.4507885796854221],
+            ],
+        ).all()