PyPI - numerai-tools - Versions diffs - 0.4.3__tar.gz → 0.5.0.dev1__tar.gz - Mend

numerai-tools 0.4.3tar.gz → 0.5.0.dev1tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

{numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: numerai_tools
-Version: 0.4.3
+Version: 0.5.0.dev1
 Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
 Home-page: https://github.com/numerai/numerai-tools
 Maintainer: Numerai

{numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/numerai_tools/scoring.py RENAMED Viewed

@@ -1,8 +1,8 @@
-from typing import List, Tuple, Union, Optional, TypeVar
+from typing import List, Tuple, Union, Optional, TypeVar, cast, Any
 import numpy as np
-import pandas as pd  # type: ignore
-from scipy import stats  # type: ignore
+import pandas as pd
+from scipy import stats
 from sklearn.preprocessing import OneHotEncoder  # type: ignore
@@ -43,12 +43,13 @@ def filter_sort_index(
         "s2 does not have enough overlapping ids with s1,"
         f" must have >= {round(1-max_filtered_ratio,2)*100}% overlapping ids"
     )
-    return s1.loc[ids].sort_index(), s2.loc[ids].sort_index()
+    return cast(S1, s1.loc[ids].sort_index()), cast(S2, s2.loc[ids].sort_index())
 def filter_sort_index_many(
-    inputs: List[S1], max_filtered_ratio: float = DEFAULT_MAX_FILTERED_INDEX_RATIO
-) -> List[S1]:
+    inputs: List[Any],
+    max_filtered_ratio: float = DEFAULT_MAX_FILTERED_INDEX_RATIO,
+) -> List[Any]:
     """Filters the indices of the given list of series to match each other,
     then sorts the indices, then checks that we didn't filter too many indices
     before returning the filtered and sorted series.
@@ -74,25 +75,38 @@ def filter_sort_index_many(
 def filter_sort_top_bottom(
-    s: pd.Series, top_bottom: int, return_concatenated: bool = True
-) -> Union[pd.Series, Tuple[pd.Series, pd.Series]]:
+    s: pd.Series, top_bottom: int
+) -> Tuple[pd.Series, pd.Series]:
     """Filters the series according to the top n and bottom n values
-    then sorts the index and returns the filtered and sorted series.
+    then sorts the index and returns two filtered and sorted series
+    for the top and bottom values respectively.
     Arguments:
         s: pd.Series - the data to filter and sort
         top_bottom: int - the number of top n and bottom n values to keep
     Returns:
-        pd.Series - the filtered and sorted data
+        Tuple[pd.Series, pd.Series] - the filtered and sorted top and bottom series respectively
     """
     tb_idx = np.argsort(s, kind="stable")
     bot = s.iloc[tb_idx[:top_bottom]]
     top = s.iloc[tb_idx[-top_bottom:]]
-    if return_concatenated:
-        return pd.concat([top, bot]).sort_index()
-    else:
-        return top.sort_index(), bot.sort_index()
+    return top.sort_index(), bot.sort_index()
+def filter_sort_top_bottom_concat(s: pd.Series, top_bottom: int) -> pd.Series:
+    """Similar to filter_sort_top_bottom, but concatenates the top and bottom series
+    into 1 series and then sorts the index.
+    Arguments:
+        s: pd.Series - the data to filter and sort
+        top_bottom: int - the number of top n and bottom n values to keep
+    Returns:
+        pd.Series - the concatenated and sorted series of top and bottom values
+    """
+    top, bot = filter_sort_top_bottom(s, top_bottom)
+    return pd.concat([top, bot]).sort_index()
 def rank(df: pd.DataFrame, method: str = "average") -> pd.DataFrame:
@@ -133,14 +147,14 @@ def variance_normalize(df: pd.DataFrame) -> pd.DataFrame:
     return df / np.std(df, axis=0)
-def weight_normalize(df: pd.DataFrame) -> pd.DataFrame:
-    """Scale a df such that all columns have absolute value sum == 1."""
-    return df / df.abs().sum(axis=0)
+def weight_normalize(s: S1) -> S1:
+    """Scale an input such that all columns have absolute value sum == 1."""
+    return cast(S1, s / s.abs().sum(axis=0))
-def center(df: pd.DataFrame) -> pd.DataFrame:
-    """Shift the df such that all columns have mean == 0."""
-    return df - df.mean()
+def center(s: S1) -> S1:
+    """Shift the input such that all columns have mean == 0."""
+    return cast(S1, s - s.mean())
 def standardize(df: pd.DataFrame) -> pd.DataFrame:
@@ -179,7 +193,7 @@ def pearson_correlation(
     target: pd.Series, predictions: pd.Series, top_bottom: Optional[int] = None
 ) -> float:
     if top_bottom is not None and top_bottom > 0:
-        predictions = filter_sort_top_bottom(predictions, top_bottom)
+        predictions = filter_sort_top_bottom_concat(predictions, top_bottom)
         target, predictions = filter_sort_index(
             target, predictions, (1 - top_bottom / len(target))
         )
@@ -205,7 +219,7 @@ def power(df: pd.DataFrame, p: float) -> pd.DataFrame:
     """
     assert not df.isna().any().any(), "Data contains NaNs"
     assert np.array_equal(df.index.sort_values(), df.index), "Index is not sorted"
-    result = np.sign(df) * np.abs(df) ** p
+    result = cast(pd.DataFrame, np.sign(df) * np.abs(df) ** p)
     assert ((result.std() == 0) | (result.corrwith(df) >= 0.9)).all()
     return result
@@ -221,7 +235,7 @@ def gaussian(df: pd.DataFrame) -> pd.DataFrame:
         pd.DataFrame - the gaussianized data
     """
     assert np.array_equal(df.index.sort_values(), df.index)
-    return df.apply(lambda series: stats.norm.ppf(series))
+    return df.apply(lambda series: cast(np.ndarray, stats.norm.ppf(series)))
 def orthogonalize(v: np.ndarray, u: np.ndarray) -> np.ndarray:
@@ -303,7 +317,7 @@ def correlation_contribution(
     m = gaussian(tie_kept_rank(meta_model.to_frame()))[meta_model.name].values
     # orthogonalize predictions wrt meta model
-    neutral_preds = orthogonalize(p, m)
+    neutral_preds = orthogonalize(p, cast(np.ndarray, m))
     # convert target to buckets [-2, -1, 0, 1, 2]
     if (live_targets >= 0).all() and (live_targets <= 1).all():
@@ -314,9 +328,9 @@ def correlation_contribution(
         # filter each column to its top and bottom n predictions
         neutral_preds_df = pd.DataFrame(
             neutral_preds, columns=predictions.columns, index=predictions.index
-        ).apply(lambda p: filter_sort_top_bottom(p, top_bottom))
-        # create a dataframe for targets to match the filtered predictions
-        live_targets = (
+        ).apply(lambda p: filter_sort_top_bottom_concat(p, top_bottom))
+        mmc_matrix = (
+            # create a dataframe for targets to match the filtered predictions
             neutral_preds_df.apply(
                 lambda p: filter_sort_index(
                     p,
@@ -326,19 +340,15 @@ def correlation_contribution(
             )
             .fillna(0)
             .T.values
-        )
-        # fillna with 0 so we don't get NaNs in the dot product
-        neutral_preds = neutral_preds_df.fillna(0).values
-    # multiply target and neutralized predictions
-    # this is equivalent to covariance b/c mean = 0
-    mmc = live_targets @ neutral_preds
-    if top_bottom is not None and top_bottom > 0:
+            # then fill NaNs with 0 so we don't get NaNs in the dot product
+            #  and multiply target w/ neutral preds to get MMC
+        ) @ neutral_preds_df.fillna(0).values
         # only the diagonal is the proper score
-        mmc = np.diag(mmc) / (top_bottom * 2)
+        mmc = np.diag(mmc_matrix) / (top_bottom * 2)
     else:
-        mmc /= len(live_targets)
+        # multiply target and neutralized predictions
+        # this is equivalent to covariance b/c mean = 0
+        mmc = (live_targets @ neutral_preds) / len(live_targets)
     return pd.Series(mmc, index=predictions.columns)
@@ -522,10 +532,10 @@ def max_feature_correlation(
     feature_correlations = features.apply(
         lambda f: pearson_correlation(f, s, top_bottom)
     )
-    feature_correlations = np.abs(feature_correlations)
+    feature_correlations = feature_correlations.abs()
     max_feature = feature_correlations.idxmax()
     max_corr = feature_correlations[max_feature]
-    return max_feature, max_corr
+    return str(max_feature), max_corr
 def generate_neutralized_weights(
@@ -608,9 +618,9 @@ def meta_portfolio_contribution(
             s_prime, neutralizers, sample_weights
         )
     )
-    w = weights[stakes.index].values
-    s = stake_weights.values
-    t = targets.values
+    w = cast(np.ndarray, weights[stakes.index].values)
+    s = cast(np.ndarray, stake_weights.values)
+    t = cast(np.ndarray, targets.values)
     swp = w @ s
     swp = swp - swp.mean()
     l1_norm = np.sum(np.abs(swp))

numerai_tools-0.5.0.dev1/numerai_tools/signals.py ADDED Viewed

@@ -0,0 +1,217 @@
+from typing import Tuple, Optional
+from numerai_tools.submissions import validate_headers_signals, validate_ids_signals
+from numerai_tools.scoring import (
+    filter_sort_index,
+    filter_sort_top_bottom,
+    spearman_correlation,
+    tie_kept_rank,
+    tie_kept_rank__gaussianize__pow_1_5,
+    filter_sort_index_many,
+    generate_neutralized_weights,
+    weight_normalize,
+    center,
+)
+import pandas as pd
+def churn(
+    s1: pd.Series,
+    s2: pd.Series,
+    top_bottom: Optional[int] = None,
+) -> float:
+    """Calculate the churn between two series. Churn is the proportion of elements
+    that are different between the two series.
+    For 2 given series with overlapping indices, churn is 1 - Spearman Correlation.
+    If top_bottom is provided, the churn is calculated as the average of the % of
+    tickers that stay in the top and bottom predictions. This is only relevant when
+    the series are rank signals and not portfolio weights.
+    Arguments:
+        s1: pd.Series - the first series to compare
+        s2: pd.Series - the second series to compare
+        top_bottom: Optional[int] - the number of top and bottom predictions to use
+                                    when calculating the correlation. Results in
+                                    2*top_bottom predictions.
+    Returns:
+        float - the churn between the two series
+    """
+    if top_bottom is not None and top_bottom > 0:
+        s1_top, s1_bot = filter_sort_top_bottom(s1, top_bottom)
+        s2_top, s2_bot = filter_sort_top_bottom(s2, top_bottom)
+        top_overlap = len(s1_top.index.intersection(s2_top.index)) / top_bottom
+        bot_overlap = len(s1_bot.index.intersection(s2_bot.index)) / top_bottom
+        avg_overlap = (top_overlap + bot_overlap) / 2
+        return 1 - avg_overlap
+    s1, s2 = filter_sort_index(s1, s2)
+    assert s1.std() > 0, "s1 must have non-zero standard deviation"
+    assert s2.std() > 0, "s2 must have non-zero standard deviation"
+    return 1 - spearman_correlation(s1, s2)
+def turnover(
+    s1: pd.Series,
+    s2: pd.Series,
+):
+    """Calculate the turnover between two series. Turnover is the total change in weights between
+    the two series divided by 2.
+    For 2 given series with overlapping indices, join the series on index, fill nans with zeroes
+    and calculate turnover as the absolute total difference between the two series divided by 2.
+    This is only relevant when the series are portfolio weights and not rank signals.
+    Arguments:
+        s1: pd.Series - the first series to compare
+        s2: pd.Series - the second series to compare
+        top_bottom: Optional[int] - the number of top and bottom predictions to use
+                                    when calculating the correlation. Results in
+                                    2*top_bottom predictions.
+    Returns:
+        float - the turnover between the two series
+    """
+    s1, s2 = filter_sort_index(s1, s2)
+    turnover = (s1 - s2).abs().sum() / 2
+    return turnover
+def neutral_weight(
+    submission: pd.Series,
+    signal_col: str,
+    neutralizer: pd.DataFrame,
+    weight: pd.Series,
+) -> pd.Series:
+    s_prime = tie_kept_rank__gaussianize__pow_1_5(submission.to_frame())
+    s_prime, neutralizer, weight = filter_sort_index_many(
+        [s_prime, neutralizer, weight]
+    )
+    neutral_weights = generate_neutralized_weights(
+        s_prime[signal_col], neutralizer, weight
+    )
+    neutral_weights = weight_normalize(center(neutral_weights.to_frame()))[0]
+    return neutral_weights.sort_index()
+def remap_ticker_col(
+    predictions: pd.DataFrame,
+    universe: pd.DataFrame,
+    ticker_col: str,
+) -> pd.DataFrame:
+    return (
+        predictions.join(universe, how="right")
+        .reset_index()
+        .set_index(ticker_col)
+        .sort_index()
+    )
+def rank_and_fill_signal(
+    universe: pd.DataFrame,
+    submission: pd.Series,
+    signal_col: str,
+) -> pd.Series:
+    uni_joined_sub = universe.sort_index().join(
+        tie_kept_rank(submission.sort_index().to_frame())
+    )[[signal_col]]
+    filled_sub = uni_joined_sub.fillna(uni_joined_sub.median()).sort_index()
+    return filled_sub[signal_col]
+def calculate_max_churn_and_turnover(
+    curr_sub: pd.DataFrame,
+    curr_neutralizer: pd.DataFrame,
+    curr_weight: pd.Series,
+    prev_week_subs: dict[str, pd.DataFrame],
+    prev_neutralizers: dict[str, pd.DataFrame],
+    prev_sample_weights: dict[str, pd.Series],
+    universe: pd.DataFrame,
+    curr_signal_col: str,
+    curr_ticker_col: str,
+) -> Tuple[float, float]:
+    """Calculate the maximum churn and turnover with respect to previous submissions.
+    Arguments:
+        curr_sub -- the current submission
+        curr_neutralizer -- the neutralizer DataFrame for the current submission
+        curr_weight -- the sample weights Series for the current submission
+        prev_week_subs -- a dictionary of datestamps to submissions
+        prev_neutralizers -- a dictionary of datestamps to neutralizers
+        prev_sample_weights -- a dictionary of datestamps to sample weights
+        universe -- the internal universe DataFrame
+        curr_signal_col -- the column name for signal in the current submission
+        curr_ticker_col -- the column name for tickers in the current submission
+    Returns:
+        prev_week_max_churn -- the maximum churn from previous submissions
+        prev_week_max_turnover -- the maximum turnover from previous submissions
+    """
+    curr_sub_vector: pd.Series = rank_and_fill_signal(
+        universe,
+        curr_sub.reset_index().set_index(curr_ticker_col).sort_index()[curr_signal_col],
+        curr_signal_col,
+    )
+    churn_stats = []
+    turnover_stats = []
+    neutralized_weights = neutral_weight(
+        curr_sub_vector, curr_signal_col, curr_neutralizer, curr_weight
+    )
+    for datestamp in prev_week_subs:
+        prev_sub = prev_week_subs[datestamp]
+        prev_neutralizer = prev_neutralizers[datestamp]
+        prev_weight = prev_sample_weights[datestamp]
+        prev_ticker_col, prev_signal_col = validate_headers_signals(prev_sub)
+        prev_universe = universe.reset_index().set_index(prev_ticker_col)
+        filtered_prev_sub_df, _ = validate_ids_signals(
+            prev_universe.index.to_series(), prev_sub, prev_ticker_col
+        )
+        # in case the previous submission has a different ticker column,
+        # remap the ticker column of prev data to the current ticker column
+        filtered_prev_sub = remap_ticker_col(
+            filtered_prev_sub_df.set_index(prev_ticker_col),
+            universe=prev_universe,
+            ticker_col=curr_ticker_col,
+        )[curr_signal_col]
+        filtered_prev_sub = rank_and_fill_signal(
+            universe=universe,
+            submission=filtered_prev_sub,
+            signal_col=curr_signal_col,
+        )
+        prev_neutralizer = remap_ticker_col(
+            prev_neutralizer,
+            universe=prev_universe,
+            ticker_col=curr_ticker_col,
+        ).filter(like="neutralizer_")
+        prev_weight = remap_ticker_col(
+            prev_weight.to_frame(),
+            universe=prev_universe,
+            ticker_col=curr_ticker_col,
+        )[prev_weight.name]
+        prev_neutralized_weights = neutral_weight(
+            filtered_prev_sub, prev_signal_col, prev_neutralizer, prev_weight
+        )
+        try:
+            churn_val = abs(churn(curr_sub_vector, filtered_prev_sub))
+        except AssertionError as e:
+            if "does not have enough overlapping ids" in str(e):
+                continue
+        try:
+            turnover_val = abs(turnover(neutralized_weights, prev_neutralized_weights))
+        except AssertionError as e:
+            if "does not have enough overlapping ids" in str(e):
+                continue
+        churn_stats.append(churn_val)
+        turnover_stats.append(turnover_val)
+    if len(churn_stats) == 0:
+        prev_week_max_churn = 1.0
+    else:
+        prev_week_max_churn = max(churn_stats)
+    if len(turnover_stats) == 0:
+        prev_week_max_turnover = 1.0
+    else:
+        prev_week_max_turnover = max(turnover_stats)
+    return prev_week_max_churn, prev_week_max_turnover

{numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/numerai_tools/submissions.py RENAMED Viewed

@@ -1,5 +1,6 @@
 from numerai_tools.scoring import tie_kept_rank
+import logging
 from typing import Tuple, List
 import pandas as pd
@@ -16,12 +17,15 @@ SIGNALS_ALLOWED_ID_COLS = [
     "numerai_ticker",
 ]
 SIGNALS_ALLOWED_PRED_COLS = ["prediction", "signal"]
+SIGNALS_ALLOWED_DATE_COLS = ["friday_date", "date"]
 SIGNALS_MIN_TICKERS = 100
 CRYPTO_ALLOWED_ID_COLS = ["symbol"]
 CRYPTO_ALLOWED_PRED_COLS = ["prediction", "signal"]
 CRYPTO_MIN_TICKERS = 100
+logger = logging.getLogger(__name__)
 def _validate_headers(
     expected_id_cols: List[str], expected_pred_cols: List[str], submission: pd.DataFrame
@@ -58,6 +62,17 @@ def validate_headers_numerai(submission: pd.DataFrame) -> Tuple[str, str]:
 def validate_headers_signals(submission: pd.DataFrame) -> Tuple[str, str]:
+    if "data_type" in submission.columns:
+        logger.warning(
+            "data_type column found in Signals submission. This is deprecated and will be removed in the future. "
+            "Please remove the data_type column from your Signals submission."
+        )
+        date_col = [
+            date_col
+            for date_col in SIGNALS_ALLOWED_DATE_COLS
+            if date_col in list(submission.columns)
+        ]
+        submission = submission.drop(columns=["data_type", *date_col], errors="ignore")
     return _validate_headers(
         SIGNALS_ALLOWED_ID_COLS, SIGNALS_ALLOWED_PRED_COLS, submission
     )
@@ -155,7 +170,7 @@ def clean_predictions(
     predictions: pd.DataFrame,
     id_col: str,
     rank_and_fill: bool,
-) -> pd.Series:
+) -> pd.DataFrame:
     """Prepare predictions for submission to Numerai.
     Filters out ids not in live data, drops duplicates, sets ids as index,
     then optionally ranks (keeping ties) and fills NaNs with 0.5.
@@ -169,6 +184,7 @@ def clean_predictions(
         predictions: pd.DataFrame - the predictions to clean
         id_col: str - the column name of the ids
         rank_and_fill: bool - whether to rank and fill NaNs with 0.5
+        left_join_ids: bool - whether to left join the predictions onto the ids
     """
     assert len(live_ids) > 0, "live_ids must not be empty"
     assert live_ids.isna().sum() == 0, "live_ids must not contain NaNs"
@@ -177,13 +193,15 @@ def clean_predictions(
     # drop null indices
     predictions = predictions[~predictions[id_col].isna()]
     predictions = (
-        predictions
-        # filter out ids not in live data
-        [predictions[id_col].isin(live_ids)]
+        predictions[
+            # filter out ids not in live data
+            predictions[id_col].isin(live_ids)
+        ]
         # drop duplicate ids (keep first)
         .drop_duplicates(subset=id_col, keep="first")
         # set ids as index
-        .set_index(id_col).sort_index()
+        .set_index(id_col)
+        .sort_index()
     )
     # rank and fill with 0.5
     if rank_and_fill:

{numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/numerai_tools.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: numerai-tools
-Version: 0.4.3
+Version: 0.5.0.dev1
 Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
 Home-page: https://github.com/numerai/numerai-tools
 Maintainer: Numerai

{numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/setup.py RENAMED Viewed

@@ -1,7 +1,7 @@
 from setuptools import setup
 from setuptools import find_packages
-VERSION = "0.4.3"
+VERSION = "0.5.0.dev1"
 def load(path):

{numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/tests/test_scoring.py RENAMED Viewed

@@ -22,6 +22,7 @@ from numerai_tools.scoring import (
     filter_sort_index,
     filter_sort_index_many,
     filter_sort_top_bottom,
+    filter_sort_top_bottom_concat,
     alpha,
     meta_portfolio_contribution,
 )
@@ -296,13 +297,12 @@ class TestScoring(unittest.TestCase):
             top_bottom=None,
         )
         np.testing.assert_allclose(
-            filter_sort_top_bottom(self.up, top_bottom=2),
+            filter_sort_top_bottom_concat(self.up, top_bottom=2),
             [0, 1, 3, 4],
         )
         top, bot = filter_sort_top_bottom(
             self.up,
             top_bottom=2,
-            return_concatenated=False,
         )
         np.testing.assert_allclose(top, [3, 4])
         np.testing.assert_allclose(bot, [0, 1])

numerai_tools-0.5.0.dev1/tests/test_signals.py ADDED Viewed

@@ -0,0 +1,139 @@
+import unittest
+import numpy as np
+import pandas as pd  # type: ignore
+from numerai_tools.signals import (
+    churn,
+    turnover,
+    calculate_max_churn_and_turnover,
+)
+from .util import (
+    generate_fake_universe,
+    generate_new_submission,
+)
+class TestSignals(unittest.TestCase):
+    def setUp(self):
+        self.up = pd.Series(list(range(5))).rename("up")
+        self.down = pd.Series(list(reversed(range(5)))).rename("down")
+        self.up_down = pd.Series([0, 1, 2, 1, 0]).rename("up_down")
+        self.oscillate = pd.Series([1, 0, 1, 0, 1]).rename("oscillate")
+        self.constant = pd.Series([1, 1, 1, 1, 1]).rename("pos_neg")
+    def test_churn(self):
+        assert np.isclose(churn(self.up, self.up), 0)
+        assert np.isclose(churn(self.up, self.up_down), 1)
+        assert np.isclose(churn(self.up, self.oscillate), 1)
+        assert np.isclose(churn(self.up, self.down), 2)
+        self.assertRaisesRegex(
+            AssertionError,
+            "s2 must have non-zero standard deviation",
+            churn,
+            self.up,
+            self.constant,
+        )
+    def test_churn_tb(self):
+        tmp = churn(self.up, self.up, top_bottom=2)
+        assert np.isclose(tmp, 0), tmp
+        tmp = churn(self.up, self.up_down, top_bottom=2)
+        assert np.isclose(tmp, 0.5), tmp
+        tmp = churn(self.up, self.oscillate, top_bottom=2)
+        assert np.isclose(tmp, 0.5), tmp
+        tmp = churn(self.up, self.down, top_bottom=2)
+        assert np.isclose(tmp, 1), tmp
+        tmp = churn(self.up, self.constant, top_bottom=2)
+        assert np.isclose(tmp, 0), tmp
+    def test_turnover(self):
+        assert np.isclose(turnover(self.up, self.up), 0)
+        assert np.isclose(turnover(self.up, self.up_down), 3)
+        assert np.isclose(turnover(self.up, self.oscillate), 4.5)
+        assert np.isclose(turnover(self.up, self.down), 6)
+        assert np.isclose(turnover(self.up, self.constant), 3.5)
+    def test_churn_first_submission(self):
+        """
+        Test that the churn function works for the first submission
+        No exceptions should be raised, should return 1
+        """
+        fake_universe = generate_fake_universe("20130308")
+        fake_submission = generate_new_submission(fake_universe)
+        fake_neutralizers = pd.DataFrame(
+            {
+                "neutralizer_1": [0.1] * len(fake_universe),
+                "neutralizer_2": [0.2] * len(fake_universe),
+            },
+            index=fake_universe["numerai_ticker"],
+        )
+        fake_sample_weights = pd.Series(
+            [0.5] * len(fake_universe),
+            index=fake_universe["numerai_ticker"],
+            name="sample_weight",
+        )
+        churn, turnover = calculate_max_churn_and_turnover(
+            curr_sub=fake_submission,
+            curr_neutralizer=fake_neutralizers,
+            curr_weight=fake_sample_weights,
+            prev_week_subs=[],
+            prev_neutralizers={"20240208": fake_neutralizers},
+            prev_sample_weights={"20240208": fake_sample_weights},
+            universe=fake_universe.set_index("numerai_ticker").sort_index(),
+            curr_signal_col="signal",
+            curr_ticker_col="numerai_ticker",
+        )
+        assert np.isclose(churn, 1)
+        assert np.isclose(turnover, 1)
+    def test_churn_handles_different_id_columns(self):
+        """
+        Test that the churn function works when
+        previous submission has different id columns.
+        """
+        fake_universe = generate_fake_universe("20130308")
+        fake_submission = generate_new_submission(fake_universe, legacy_headers=True)
+        new_fake_universe = generate_fake_universe(
+            date_value="20130308", ticker_col="ticker"
+        )
+        fake_universe["ticker"] = new_fake_universe["ticker"]
+        prev_submission = fake_submission.copy()
+        fake_neutralizers = pd.DataFrame(
+            {
+                "neutralizer_1": [0.1] * len(fake_universe),
+                "neutralizer_2": [0.2] * len(fake_universe),
+            },
+            index=fake_universe["numerai_ticker"],
+        )
+        fake_sample_weights = pd.Series(
+            [0.5] * len(fake_universe),
+            index=fake_universe["numerai_ticker"],
+            name="sample_weight",
+        )
+        # switch out the numerai_ticker col in-place
+        prev_submission["numerai_ticker"] = new_fake_universe["ticker"]
+        prev_submission.rename(columns={"numerai_ticker": "ticker"}, inplace=True)
+        prev_neutralizers = fake_neutralizers.copy()
+        prev_neutralizers.index = new_fake_universe["ticker"]
+        prev_neutralizers.index.name = "ticker"
+        prev_sample_weights = fake_sample_weights.copy()
+        prev_sample_weights.index = new_fake_universe["ticker"]
+        prev_sample_weights.index.name = "ticker"
+        churn, turnover = calculate_max_churn_and_turnover(
+            curr_sub=fake_submission,
+            curr_neutralizer=fake_neutralizers,
+            curr_weight=fake_sample_weights,
+            prev_week_subs={"20240208": prev_submission},
+            prev_neutralizers={"20240208": prev_neutralizers},
+            prev_sample_weights={"20240208": prev_sample_weights},
+            universe=fake_universe.set_index("numerai_ticker").sort_index(),
+            curr_signal_col="signal",
+            curr_ticker_col="numerai_ticker",
+        )
+        assert np.isclose(churn, 0)
+        assert np.isclose(turnover, 0)
+if __name__ == "__main__":
+    unittest.main()

{numerai_tools-0.4.3 → numerai_tools-0.5.0.dev1}/tests/test_submissions.py RENAMED Viewed

@@ -155,6 +155,18 @@ class TestSubmissions(unittest.TestCase):
                 sub[[sub.columns[1]]],
             )
+    def test_validate_headers_signals_data_type_and_date_col(self):
+        fake_sub = generate_submission(self.ids, "ticker", "signal")
+        fake_sub["data_type"] = "signals"
+        fake_sub["friday_date"] = "2023-01-01"
+        with self.assertLogs(level="WARNING") as cm:
+            assert validate_headers_signals(fake_sub) == ("ticker", "signal")
+        self.assertIn(
+            "WARNING:numerai_tools.submissions:data_type column found in Signals submission. This is deprecated and will be removed in the future. "
+            "Please remove the data_type column from your Signals submission.",
+            cm.output[0],
+        )
     def test_validate_headers_crypto(self):
         for sub in self.crypto_subs:
             assert validate_headers_crypto(sub) == tuple(sub.columns)
@@ -432,7 +444,7 @@ class TestSubmissions(unittest.TestCase):
         assert not cleaned_predictions.index.duplicated().any()
-def generate_ids(id_length: int, num_rows: int) -> List[str]:
+def generate_ids(id_length: int, num_rows: int) -> pd.Series:
     """Generates a given number of unique ascii-valued strings of a given length.
     Arguments:

numerai_tools-0.4.3/numerai_tools/signals.py DELETED Viewed

@@ -1,72 +0,0 @@
-from numerai_tools.scoring import (
-    filter_sort_index,
-    filter_sort_top_bottom,
-    spearman_correlation,
-)
-from typing import List, Tuple, Union, Optional
-import pandas as pd
-def churn(
-    s1: pd.Series,
-    s2: pd.Series,
-    top_bottom: Optional[int] = None,
-) -> float:
-    """Calculate the churn between two series. Churn is the proportion of elements
-    that are different between the two series.
-    For 2 given series with overlapping indices, churn is 1 - Spearman Correlation.
-    If top_bottom is provided, the churn is calculated as the average of the % of
-    tickers that stay in the top and bottom predictions. This is only relevant when
-    the series are rank signals and not portfolio weights.
-    Arguments:
-        s1: pd.Series - the first series to compare
-        s2: pd.Series - the second series to compare
-        top_bottom: Optional[int] - the number of top and bottom predictions to use
-                                    when calculating the correlation. Results in
-                                    2*top_bottom predictions.
-    Returns:
-        float - the churn between the two series
-    """
-    if top_bottom is not None and top_bottom > 0:
-        s1_top, s1_bot = filter_sort_top_bottom(s1, top_bottom, False)
-        s2_top, s2_bot = filter_sort_top_bottom(s2, top_bottom, False)
-        top_overlap = len(s1_top.index.intersection(s2_top.index)) / top_bottom
-        bot_overlap = len(s1_bot.index.intersection(s2_bot.index)) / top_bottom
-        avg_overlap = (top_overlap + bot_overlap) / 2
-        return 1 - avg_overlap
-    s1, s2 = filter_sort_index(s1, s2)
-    assert s1.std() > 0, "s1 must have non-zero standard deviation"
-    assert s2.std() > 0, "s2 must have non-zero standard deviation"
-    return 1 - spearman_correlation(s1, s2)
-def turnover(
-    s1: pd.Series,
-    s2: pd.Series,
-):
-    """Calculate the turnover between two series. Turnover is the total change in weights between
-    the two series divided by 2.
-    For 2 given series with overlapping indices, join the series on index, fill nans with zeroes
-    and calculate turnover as the absolute total difference between the two series divided by 2.
-    This is only relevant when the series are portfolio weights and not rank signals.
-    Arguments:
-        s1: pd.Series - the first series to compare
-        s2: pd.Series - the second series to compare
-        top_bottom: Optional[int] - the number of top and bottom predictions to use
-                                    when calculating the correlation. Results in
-                                    2*top_bottom predictions.
-    Returns:
-        float - the turnover between the two series
-    """
-    s1, s2 = filter_sort_index(s1, s2)
-    turnover = (s1 - s2).abs().sum() / 2
-    return turnover

numerai_tools-0.4.3/tests/test_signals.py DELETED Viewed

@@ -1,51 +0,0 @@
-import unittest
-import numpy as np
-import pandas as pd  # type: ignore
-from numerai_tools.signals import churn, turnover
-class TestSignals(unittest.TestCase):
-    def setUp(self):
-        self.up = pd.Series(list(range(5))).rename("up")
-        self.down = pd.Series(list(reversed(range(5)))).rename("down")
-        self.up_down = pd.Series([0, 1, 2, 1, 0]).rename("up_down")
-        self.oscillate = pd.Series([1, 0, 1, 0, 1]).rename("oscillate")
-        self.constant = pd.Series([1, 1, 1, 1, 1]).rename("pos_neg")
-    def test_churn(self):
-        assert np.isclose(churn(self.up, self.up), 0)
-        assert np.isclose(churn(self.up, self.up_down), 1)
-        assert np.isclose(churn(self.up, self.oscillate), 1)
-        assert np.isclose(churn(self.up, self.down), 2)
-        self.assertRaisesRegex(
-            AssertionError,
-            "s2 must have non-zero standard deviation",
-            churn,
-            self.up,
-            self.constant,
-        )
-    def test_churn_tb(self):
-        tmp = churn(self.up, self.up, top_bottom=2)
-        assert np.isclose(tmp, 0), tmp
-        tmp = churn(self.up, self.up_down, top_bottom=2)
-        assert np.isclose(tmp, 0.5), tmp
-        tmp = churn(self.up, self.oscillate, top_bottom=2)
-        assert np.isclose(tmp, 0.5), tmp
-        tmp = churn(self.up, self.down, top_bottom=2)
-        assert np.isclose(tmp, 1), tmp
-        tmp = churn(self.up, self.constant, top_bottom=2)
-        assert np.isclose(tmp, 0), tmp
-    def test_turnover(self):
-        assert np.isclose(turnover(self.up, self.up), 0)
-        assert np.isclose(turnover(self.up, self.up_down), 3)
-        assert np.isclose(turnover(self.up, self.oscillate), 4.5)
-        assert np.isclose(turnover(self.up, self.down), 6)
-        assert np.isclose(turnover(self.up, self.constant), 3.5)
-if __name__ == "__main__":
-    unittest.main()