PyPI - numerai-tools - Versions diffs - 0.4.2.dev1__tar.gz → 0.5.0__tar.gz - Mend

numerai-tools 0.4.2.dev1__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

numerai_tools-0.5.0/PKG-INFO +40 -0
numerai_tools-0.5.0/README.md +15 -0
{numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0}/numerai_tools/scoring.py +111 -92
numerai_tools-0.5.0/numerai_tools/signals.py +206 -0
numerai_tools-0.5.0/numerai_tools/submissions.py +348 -0
numerai_tools-0.5.0/pyproject.toml +52 -0
numerai_tools-0.4.2.dev1/PKG-INFO +0 -22
numerai_tools-0.4.2.dev1/README.md +0 -2
numerai_tools-0.4.2.dev1/numerai_tools/signals.py +0 -72
numerai_tools-0.4.2.dev1/numerai_tools/submissions.py +0 -191
numerai_tools-0.4.2.dev1/numerai_tools.egg-info/PKG-INFO +0 -22
numerai_tools-0.4.2.dev1/numerai_tools.egg-info/SOURCES.txt +0 -16
numerai_tools-0.4.2.dev1/numerai_tools.egg-info/dependency_links.txt +0 -1
numerai_tools-0.4.2.dev1/numerai_tools.egg-info/requires.txt +0 -4
numerai_tools-0.4.2.dev1/numerai_tools.egg-info/top_level.txt +0 -1
numerai_tools-0.4.2.dev1/setup.cfg +0 -4
numerai_tools-0.4.2.dev1/setup.py +0 -47
numerai_tools-0.4.2.dev1/tests/test_scoring.py +0 -346
numerai_tools-0.4.2.dev1/tests/test_signals.py +0 -51
numerai_tools-0.4.2.dev1/tests/test_submissions.py +0 -486
{numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0}/LICENSE +0 -0
{numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0}/numerai_tools/__init__.py +0 -0
{numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0}/numerai_tools/py.typed +0 -0

numerai_tools-0.5.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,40 @@
+Metadata-Version: 2.3
+Name: numerai-tools
+Version: 0.5.0
+Summary: A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
+License: MIT
+Author: Numerai Engineering
+Author-email: engineering@numer.ai
+Requires-Python: >=3.11
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Environment :: Console
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Scientific/Engineering
+Requires-Dist: numpy (>=2.0.0,<3.0.0)
+Requires-Dist: pandas (>=2.2.2,<3.0.0)
+Requires-Dist: scikit-learn (>=1.5.0,<2.0.0)
+Requires-Dist: scipy (>=1.13.0,<2.0.0)
+Project-URL: Documentation, https://docs.numer.ai/
+Project-URL: Homepage, https://numer.ai
+Project-URL: Repository, https://github.com/numerai/numerai-tools
+Description-Content-Type: text/markdown
+# numerai-tools
+A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
+## Installation
+```
+pip install numerai-tools
+```
+## Structure
+- The `scoring.py` module contains critical functions used to score submissions. We use this code in our scoring system. Leverage this to optimize your models for the tournaments.
+- The `submissions.py` module provides helper functions to ensure your submissions are valid and formatted correctly. Use this in your automated prediction pipelines to ensure uploads don't fail.
+- The `signals.py` module provides code specific to Numerai Signals such as churn and turnover. Use this to ensure your Signals submissions are properly formatted.

numerai_tools-0.5.0/README.md ADDED Viewed

@@ -0,0 +1,15 @@
+# numerai-tools
+A collection of open-source tools to help interact with Numerai, model data, and automate submissions.
+## Installation
+```
+pip install numerai-tools
+```
+## Structure
+- The `scoring.py` module contains critical functions used to score submissions. We use this code in our scoring system. Leverage this to optimize your models for the tournaments.
+- The `submissions.py` module provides helper functions to ensure your submissions are valid and formatted correctly. Use this in your automated prediction pipelines to ensure uploads don't fail.
+- The `signals.py` module provides code specific to Numerai Signals such as churn and turnover. Use this to ensure your Signals submissions are properly formatted.

{numerai_tools-0.4.2.dev1 → numerai_tools-0.5.0}/numerai_tools/scoring.py RENAMED Viewed

@@ -1,8 +1,8 @@
-from typing import List, Tuple, Union, Optional, TypeVar
+from typing import List, Literal, Tuple, Union, Optional, TypeVar, cast, Any
 import numpy as np
-import pandas as pd  # type: ignore
-from scipy import stats  # type: ignore
+import pandas as pd
+from scipy import stats
 from sklearn.preprocessing import OneHotEncoder  # type: ignore
@@ -14,6 +14,7 @@ DEFAULT_MAX_FILTERED_INDEX_RATIO = 0.2
 S1 = TypeVar("S1", bound=Union[pd.DataFrame, pd.Series])
 S2 = TypeVar("S2", bound=Union[pd.DataFrame, pd.Series])
+RANK_METHOD_TYPE = Literal["average", "min", "max", "first", "dense"]
 def filter_sort_index(
@@ -43,12 +44,13 @@ def filter_sort_index(
         "s2 does not have enough overlapping ids with s1,"
         f" must have >= {round(1-max_filtered_ratio,2)*100}% overlapping ids"
     )
-    return s1.loc[ids].sort_index(), s2.loc[ids].sort_index()
+    return cast(S1, s1.loc[ids].sort_index()), cast(S2, s2.loc[ids].sort_index())
 def filter_sort_index_many(
-    inputs: List[S1], max_filtered_ratio: float = DEFAULT_MAX_FILTERED_INDEX_RATIO
-) -> List[S1]:
+    inputs: List[Any],
+    max_filtered_ratio: float = DEFAULT_MAX_FILTERED_INDEX_RATIO,
+) -> List[Any]:
     """Filters the indices of the given list of series to match each other,
     then sorts the indices, then checks that we didn't filter too many indices
     before returning the filtered and sorted series.
@@ -74,43 +76,72 @@ def filter_sort_index_many(
 def filter_sort_top_bottom(
-    s: pd.Series, top_bottom: int, return_concatenated: bool = True
-) -> Union[pd.Series, Tuple[pd.Series, pd.Series]]:
+    s: pd.Series, top_bottom: int
+) -> Tuple[pd.Series, pd.Series]:
     """Filters the series according to the top n and bottom n values
-    then sorts the index and returns the filtered and sorted series.
+    then sorts the index and returns two filtered and sorted series
+    for the top and bottom values respectively.
     Arguments:
         s: pd.Series - the data to filter and sort
         top_bottom: int - the number of top n and bottom n values to keep
     Returns:
-        pd.Series - the filtered and sorted data
+        Tuple[pd.Series, pd.Series] - the filtered and sorted top and bottom series respectively
     """
     tb_idx = np.argsort(s, kind="stable")
     bot = s.iloc[tb_idx[:top_bottom]]
     top = s.iloc[tb_idx[-top_bottom:]]
-    if return_concatenated:
-        return pd.concat([top, bot]).sort_index()
-    else:
-        return top.sort_index(), bot.sort_index()
+    return top.sort_index(), bot.sort_index()
-def rank(df: pd.DataFrame, method: str = "average") -> pd.DataFrame:
-    """Percentile rank each column of a pandas DataFrame, centering values around 0.5
+def filter_sort_top_bottom_concat(s: pd.Series, top_bottom: int) -> pd.Series:
+    """Similar to filter_sort_top_bottom, but concatenates the top and bottom series
+    into 1 series and then sorts the index.
     Arguments:
-        df: pd.DataFrame - the data to rank
+        s: pd.Series - the data to filter and sort
+        top_bottom: int - the number of top n and bottom n values to keep
+    Returns:
+        pd.Series - the concatenated and sorted series of top and bottom values
+    """
+    top, bot = filter_sort_top_bottom(s, top_bottom)
+    return pd.concat([top, bot]).sort_index()
+def rank_series(s: pd.Series, method: RANK_METHOD_TYPE = "average") -> pd.Series:
+    """Percentile rank a pandas Series, centering values around 0.5.
+    Arguments:
+        s: pd.Series - the data to rank
         method: str - the pandas ranking method to use, options:
             'average' (default) - keeps ties
             'first' - breaks ties by index
     Returns:
-        pd.DataFrame - the ranked DataFrame
+        pd.Series - the ranked Series
     """
-    assert np.array_equal(df.index.sort_values(), df.index), "unsorted index found"
-    return df.apply(
-        lambda series: (series.rank(method=method).values - 0.5) / series.count()
-    )
+    assert np.array_equal(s.index.sort_values(), s.index), "unsorted index found"
+    return (s.rank(method=method) - 0.5) / s.count()
+def rank(s: S1, method: RANK_METHOD_TYPE = "average") -> S1:
+    """Percentile rank each columns or series, centering values around 0.5
+    Arguments:
+        s: pd.DataFrame | pd.Series - the data to rank
+        method: str - the pandas ranking method to use, options:
+            'average' (default) - keeps ties
+            'first' - breaks ties by index
+    Returns:
+        pd.DataFrame | pd.Series - the ranked input data
+    """
+    if isinstance(s, pd.Series):
+        return cast(S1, rank_series(s, method))
+    else:
+        return s.apply(lambda series: rank(series, method=method))
 def tie_broken_rank(df: pd.DataFrame) -> pd.DataFrame:
@@ -118,9 +149,9 @@ def tie_broken_rank(df: pd.DataFrame) -> pd.DataFrame:
     return rank(df, "first")
-def tie_kept_rank(df: pd.DataFrame) -> pd.DataFrame:
+def tie_kept_rank(s: S1) -> S1:
     """Rank columns, but keep ties."""
-    return rank(df, "average")
+    return cast(S1, rank(s, "average"))
 def min_max_normalize(s: pd.Series) -> pd.Series:
@@ -133,14 +164,14 @@ def variance_normalize(df: pd.DataFrame) -> pd.DataFrame:
     return df / np.std(df, axis=0)
-def weight_normalize(df: pd.DataFrame) -> pd.DataFrame:
-    """Scale a df such that all columns have absolute value sum == 1."""
-    return df / df.abs().sum(axis=0)
+def weight_normalize(s: S1) -> S1:
+    """Scale a input such that all columns have absolute value sum == 1."""
+    return cast(S1, s / s.abs().sum(axis=0))
-def center(df: pd.DataFrame) -> pd.DataFrame:
-    """Shift the df such that all columns have mean == 0."""
-    return df - df.mean()
+def center(s: S1) -> S1:
+    """Shift the input such that all columns have mean == 0."""
+    return cast(S1, s - s.mean())
 def standardize(df: pd.DataFrame) -> pd.DataFrame:
@@ -179,7 +210,7 @@ def pearson_correlation(
     target: pd.Series, predictions: pd.Series, top_bottom: Optional[int] = None
 ) -> float:
     if top_bottom is not None and top_bottom > 0:
-        predictions = filter_sort_top_bottom(predictions, top_bottom)
+        predictions = filter_sort_top_bottom_concat(predictions, top_bottom)
         target, predictions = filter_sort_index(
             target, predictions, (1 - top_bottom / len(target))
         )
@@ -205,7 +236,7 @@ def power(df: pd.DataFrame, p: float) -> pd.DataFrame:
     """
     assert not df.isna().any().any(), "Data contains NaNs"
     assert np.array_equal(df.index.sort_values(), df.index), "Index is not sorted"
-    result = np.sign(df) * np.abs(df) ** p
+    result = cast(pd.DataFrame, np.sign(df) * np.abs(df) ** p)
     assert ((result.std() == 0) | (result.corrwith(df) >= 0.9)).all()
     return result
@@ -221,7 +252,7 @@ def gaussian(df: pd.DataFrame) -> pd.DataFrame:
         pd.DataFrame - the gaussianized data
     """
     assert np.array_equal(df.index.sort_values(), df.index)
-    return df.apply(lambda series: stats.norm.ppf(series))
+    return df.apply(lambda series: cast(np.ndarray, stats.norm.ppf(series)))
 def orthogonalize(v: np.ndarray, u: np.ndarray) -> np.ndarray:
@@ -303,7 +334,7 @@ def correlation_contribution(
     m = gaussian(tie_kept_rank(meta_model.to_frame()))[meta_model.name].values
     # orthogonalize predictions wrt meta model
-    neutral_preds = orthogonalize(p, m)
+    neutral_preds = orthogonalize(p, cast(np.ndarray, m))
     # convert target to buckets [-2, -1, 0, 1, 2]
     if (live_targets >= 0).all() and (live_targets <= 1).all():
@@ -314,9 +345,9 @@ def correlation_contribution(
         # filter each column to its top and bottom n predictions
         neutral_preds_df = pd.DataFrame(
             neutral_preds, columns=predictions.columns, index=predictions.index
-        ).apply(lambda p: filter_sort_top_bottom(p, top_bottom))
-        # create a dataframe for targets to match the filtered predictions
-        live_targets = (
+        ).apply(lambda p: filter_sort_top_bottom_concat(p, top_bottom))
+        mmc_matrix = (
+            # create a dataframe for targets to match the filtered predictions
             neutral_preds_df.apply(
                 lambda p: filter_sort_index(
                     p,
@@ -326,19 +357,15 @@ def correlation_contribution(
             )
             .fillna(0)
             .T.values
-        )
-        # fillna with 0 so we don't get NaNs in the dot product
-        neutral_preds = neutral_preds_df.fillna(0).values
-    # multiply target and neutralized predictions
-    # this is equivalent to covariance b/c mean = 0
-    mmc = live_targets @ neutral_preds
-    if top_bottom is not None and top_bottom > 0:
+            # then fill NaNs with 0 so we don't get NaNs in the dot product
+            #  and mutiply target w/ neutral preds to get MMC
+        ) @ neutral_preds_df.fillna(0).values
         # only the diagonal is the proper score
-        mmc = np.diag(mmc) / (top_bottom * 2)
+        mmc = np.diag(mmc_matrix) / (top_bottom * 2)
     else:
-        mmc /= len(live_targets)
+        # multiply target and neutralized predictions
+        # this is equivalent to covariance b/c mean = 0
+        mmc = (live_targets @ neutral_preds) / len(live_targets)
     return pd.Series(mmc, index=predictions.columns)
@@ -461,7 +488,7 @@ def numerai_corr(
     Returns:
         pd.Series - the resulting correlation scores for each column in predictions
     """
-    targets = targets - targets.mean()
+    targets = center(targets)
     targets, predictions = filter_sort_index(
         targets, predictions, max_filtered_index_ratio
     )
@@ -522,21 +549,33 @@ def max_feature_correlation(
     feature_correlations = features.apply(
         lambda f: pearson_correlation(f, s, top_bottom)
     )
-    feature_correlations = np.abs(feature_correlations)
+    feature_correlations = feature_correlations.abs()
     max_feature = feature_correlations.idxmax()
     max_corr = feature_correlations[max_feature]
-    return max_feature, max_corr
+    return str(max_feature), max_corr
 def generate_neutralized_weights(
-    predictions: pd.Series,
+    predictions: pd.DataFrame,
     neutralizers: pd.DataFrame,
     sample_weights: pd.Series,
-) -> pd.Series:
-    neutral_preds = predictions - (
-        neutralizers @ (neutralizers.T @ (sample_weights * predictions))
+    center_and_normalize: bool = False,
+) -> pd.DataFrame:
+    assert not predictions.isna().any().any(), "Predictions contain NaNs"
+    assert not neutralizers.isna().any().any(), "Normalization factors contain NaNs"
+    assert not sample_weights.isna().any(), "Weights contain NaNs"
+    ranked_predictions = tie_kept_rank__gaussianize__pow_1_5(predictions)
+    ranked_predictions, neutralizers, sample_weights = filter_sort_index_many(
+        [ranked_predictions, neutralizers, sample_weights]
+    )
+    neutral_weights = ranked_predictions.apply(
+        lambda s_prime: (
+            s_prime - neutralizers @ (neutralizers.T @ (sample_weights * s_prime))
+        )
+        * sample_weights
     )
-    neutral_weights = neutral_preds * sample_weights
+    if center_and_normalize:
+        neutral_weights = weight_normalize(center(neutral_weights))
     return neutral_weights
@@ -557,18 +596,9 @@ def alpha(
         sample_weights: pd.Series - the universe sampling weights
         targets: pd.Series - the live targets to evaluate against
     """
-    assert not predictions.isna().any().any(), "Predictions contain NaNs"
-    assert not neutralizers.isna().any().any(), "Normalization factors contain NaNs"
-    assert not sample_weights.isna().any(), "Weights contain NaNs"
-    predictions, neutralizers, sample_weights, targets = filter_sort_index_many(
-        [predictions, neutralizers, sample_weights, targets]
-    )
-    weights = tie_kept_rank__gaussianize__pow_1_5(predictions).apply(
-        lambda s_prime: generate_neutralized_weights(
-            s_prime, neutralizers, sample_weights
-        )
-    )
+    targets = center(targets)
+    predictions, targets = filter_sort_index(predictions, targets)
+    weights = generate_neutralized_weights(predictions, neutralizers, sample_weights)
     alpha_scores = weights.apply(lambda w: w @ targets) / len(targets)
     return alpha_scores
@@ -593,33 +623,22 @@ def meta_portfolio_contribution(
         sample_weights: pd.Series - the universe sampling weights
         targets: pd.Series - the live targets to evaluate against
     """
-    assert not predictions.isna().any().any(), "Predictions contain NaNs"
-    assert not neutralizers.isna().any().any(), "Normalization factors contain NaNs"
-    assert not sample_weights.isna().any(), "Weights contain NaNs"
-    predictions, neutralizers, sample_weights, targets = filter_sort_index_many(
-        [predictions, neutralizers, sample_weights, targets]
-    )
+    targets = center(targets)
+    predictions, targets = filter_sort_index(predictions, targets)
     stake_weights = weight_normalize(stakes.fillna(0))
     assert np.isclose(stake_weights.sum(), 1), "Stakes must sum to 1"
-    weights = tie_kept_rank__gaussianize__pow_1_5(predictions).apply(
-        lambda s_prime: generate_neutralized_weights(
-            s_prime, neutralizers, sample_weights
-        )
-    )
-    w = weights[stakes.index].values
-    s = stake_weights.values
-    t = targets.values
+    weights = generate_neutralized_weights(predictions, neutralizers, sample_weights)
+    w = cast(np.ndarray, weights[stakes.index].values)
+    s = cast(np.ndarray, stake_weights.values)
+    t = cast(np.ndarray, targets.values)
     swp = w @ s
     swp = swp - swp.mean()
-    swp_abs_sum = np.sum(np.abs(swp))
+    l1_norm = np.sum(np.abs(swp))
+    l1_norm_squared = np.power(l1_norm, 2)
     swp_sign = np.sign(swp)
-    alpha_unnormalized_swp_grad = (
-        1
-        / np.power(swp_abs_sum, 2)
-        * (swp_abs_sum * t - swp_sign * np.dot(swp, t)).reshape(-1, 1)
-    )
-    zero_mean_jac_vec_prod = (
-        alpha_unnormalized_swp_grad - alpha_unnormalized_swp_grad.mean()
-    )
-    mpc = (w.T @ zero_mean_jac_vec_prod).squeeze()
+    swp_alpha = np.dot(swp, t)
+    directional_gradient = l1_norm * t - swp_sign * swp_alpha
+    jacobian_vector_product = directional_gradient.reshape(-1, 1) / l1_norm_squared
+    centered_jacobian = jacobian_vector_product - jacobian_vector_product.mean()
+    mpc = (w.T @ centered_jacobian).squeeze()
     return pd.Series(mpc, index=stakes.index)

numerai_tools-0.5.0/numerai_tools/signals.py ADDED Viewed

@@ -0,0 +1,206 @@
+from typing import Tuple, Optional
+from numerai_tools.scoring import (
+    filter_sort_index,
+    filter_sort_top_bottom,
+    spearman_correlation,
+    generate_neutralized_weights,
+)
+from numerai_tools.submissions import (
+    validate_submission_signals,
+    clean_submission,
+)
+import pandas as pd
+def churn(
+    s1: pd.Series,
+    s2: pd.Series,
+    top_bottom: Optional[int] = None,
+) -> float:
+    """Calculate the churn between two series. Churn is the proportion of elements
+    that are different between the two series.
+    For 2 given series with overlapping indices, churn is 1 - Spearman Correlation.
+    If top_bottom is provided, the churn is calculated as the average of the % of
+    tickers that stay in the top and bottom predictions. This is only relevant when
+    the series are rank signals and not portfolio weights.
+    Arguments:
+        s1: pd.Series - the first series to compare
+        s2: pd.Series - the second series to compare
+        top_bottom: Optional[int] - the number of top and bottom predictions to use
+                                    when calculating the correlation. Results in
+                                    2*top_bottom predictions.
+    Returns:
+        float - the churn between the two series
+    """
+    if top_bottom is not None and top_bottom > 0:
+        s1_top, s1_bot = filter_sort_top_bottom(s1, top_bottom)
+        s2_top, s2_bot = filter_sort_top_bottom(s2, top_bottom)
+        top_overlap = len(s1_top.index.intersection(s2_top.index)) / top_bottom
+        bot_overlap = len(s1_bot.index.intersection(s2_bot.index)) / top_bottom
+        avg_overlap = (top_overlap + bot_overlap) / 2
+        return 1 - avg_overlap
+    s1, s2 = filter_sort_index(s1, s2)
+    assert s1.std() > 0, "s1 must have non-zero standard deviation"
+    assert s2.std() > 0, "s2 must have non-zero standard deviation"
+    return 1 - spearman_correlation(s1, s2)
+def turnover(
+    s1: pd.Series,
+    s2: pd.Series,
+):
+    """Calculate the turnover between two series. Turnover is the total change in weights between
+    the two series divided by 2.
+    For 2 given series with overlapping indices, join the series on index, fill nans with zeroes
+    and calculate turnover as the absolute total difference between the two series divided by 2.
+    This is only relevant when the series are portfolio weights and not rank signals.
+    Arguments:
+        s1: pd.Series - the first series to compare
+        s2: pd.Series - the second series to compare
+    Returns:
+        float - the turnover between the two series
+    """
+    s1, s2 = filter_sort_index(s1, s2)
+    turnover = (s1 - s2).abs().sum() / 2
+    return turnover
+def calculate_max_churn_and_turnover(
+    curr_sub: pd.Series,
+    curr_neutralizer: pd.DataFrame,
+    curr_sample_weight: pd.Series,
+    prev_subs: dict[str, pd.Series],
+    prev_neutralizers: dict[str, pd.DataFrame],
+    prev_sample_weights: dict[str, pd.Series],
+) -> Tuple[float, float]:
+    """Calculate the maximum churn and turnover of the current submission with respect to previous submissions.
+    This function iterates over previous submissions and calculates churn and turnover for each submission
+    against the current submission. It expects the following:
+        - all submissions, neutralizers, and sample weights are indexed on the same type of tickers/IDs
+          (e.g. all numerai_ticker, or all composite_figi, or all etc.)
+        - neutralizers and sample weights cover the full universe of their respective eras. This means you
+          should avoid removing rows from neutralizers or sample weights before passing them to this function.
+    In a live submission environment your submissions are joined on their respective full universes, ranked,
+    and then any NaNs are filled with 0.5 before calculating churn and turnover. So, if you provide filtered
+    neutralizers or sample weights, your locally calculated churn and turnover may not match the live value.
+    Arguments:
+        curr_sub: pd.Series - current-era submission indexed on tickers/ids
+        curr_neutralizer: pd.DataFrame
+            - current-era neutralizers indexed on the same type of tickers/ids.
+              We expect these to cover the full universe for the current era.
+        curr_sample_weight: pd.Series
+            - current-era sample weights indexed on the same type of tickers/ids.
+              We expect these to cover the full universe for the current era.
+        prev_subs: dict[str, pd.Series]
+            - a dictionary mapping datestamps to submissions, where each submission is a
+              Series indexed on the same type of tickers/ids as the current
+              submission. To calculate churn and turnover for a live submission,
+              use the most recent 5 submissions. For diagnostics, just provide the
+              last 1 era.
+        prev_neutralizers: dict[str, pd.DataFrame]
+            - a dictionary mapping datestamps to neutralizers DataFrames where each neutralizers
+              DataFrame is indexed on the same type of tickers/ids as the current submission.
+              We expect each of these to cover the full universe of their respective eras.
+        prev_sample_weights: dict[str, pd.Series]
+            - a dictionary mapping datestamps to sample weights where each sample weights
+              Series is indexed on the same type of tickers/ids as the current submission.
+              We expect each of these to cover the full universe of their respective eras.
+    Returns:
+        prev_week_max_churn -- the maximum churn from previous submissions
+        prev_week_max_turnover -- the maximum turnover from previous submissions
+    """
+    (
+        curr_ticker_col,
+        curr_signal_col,
+        _,
+        curr_sub_df,
+        _,
+    ) = validate_submission_signals(
+        universe=curr_sample_weight.index.to_frame(),
+        submission=curr_sub.reset_index(),
+    )
+    curr_sub = clean_submission(
+        universe=curr_sample_weight.index.to_frame(),
+        submission=curr_sub_df,
+        src_id_col=curr_ticker_col,
+        src_signal_col=curr_signal_col,
+        rank_and_fill=True,
+    )
+    churn_stats = []
+    turnover_stats = []
+    neutralized_weights = generate_neutralized_weights(
+        curr_sub.to_frame(),
+        curr_neutralizer,
+        curr_sample_weight,
+        center_and_normalize=True,
+    )[curr_sub.name]
+    for datestamp in prev_subs:
+        prev_sub = prev_subs[datestamp]
+        prev_neutralizer = prev_neutralizers[datestamp]
+        prev_sample_weight = prev_sample_weights[datestamp]
+        (
+            prev_ticker_col,
+            prev_signal_col,
+            _,
+            prev_sub_df,
+            _,
+        ) = validate_submission_signals(
+            universe=prev_sample_weight.index.to_frame(),
+            submission=prev_sub.reset_index(),
+        )
+        prev_sub = clean_submission(
+            universe=prev_sample_weight.index.to_frame(),
+            submission=prev_sub_df,
+            src_id_col=prev_ticker_col,
+            src_signal_col=prev_signal_col,
+            dst_id_col=curr_ticker_col,
+            dst_signal_col=curr_signal_col,
+            rank_and_fill=True,
+        )
+        prev_neutralized_weights = generate_neutralized_weights(
+            prev_sub.to_frame(),
+            prev_neutralizer,
+            prev_sample_weight,
+            center_and_normalize=True,
+        )[prev_sub.name]
+        try:
+            churn_val = abs(churn(curr_sub, prev_sub))
+        except AssertionError as e:
+            if "does not have enough overlapping ids" in str(e):
+                continue
+        try:
+            turnover_val = abs(turnover(neutralized_weights, prev_neutralized_weights))
+        except AssertionError as e:
+            if "does not have enough overlapping ids" in str(e):
+                continue
+        churn_stats.append(churn_val)
+        turnover_stats.append(turnover_val)
+    if len(churn_stats) == 0:
+        prev_week_max_churn = 1.0
+    else:
+        prev_week_max_churn = max(churn_stats)
+    if len(turnover_stats) == 0:
+        prev_week_max_turnover = 1.0
+    else:
+        prev_week_max_turnover = max(turnover_stats)
+    return prev_week_max_churn, prev_week_max_turnover

numerai-tools 0.4.2.dev1__tar.gz → 0.5.0__tar.gz

numerai-tools 0.4.2.dev1__tar.gz → 0.5.0__tar.gz