alpha-engine-lib 0.48.0__tar.gz → 0.49.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/PKG-INFO +6 -1
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/README.md +1 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/pyproject.toml +5 -1
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/__init__.py +1 -1
- alpha_engine_lib-0.49.0/src/alpha_engine_lib/quant/stats/__init__.py +22 -0
- alpha_engine_lib-0.49.0/src/alpha_engine_lib/quant/stats/dsr.py +278 -0
- alpha_engine_lib-0.49.0/src/alpha_engine_lib/quant/stats/expectancy.py +161 -0
- alpha_engine_lib-0.49.0/src/alpha_engine_lib/quant/stats/information_coefficient.py +149 -0
- alpha_engine_lib-0.49.0/src/alpha_engine_lib/quant/stats/multiple_testing.py +48 -0
- alpha_engine_lib-0.49.0/src/alpha_engine_lib/quant/stats/risk_matched_benchmark.py +305 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib.egg-info/PKG-INFO +6 -1
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib.egg-info/SOURCES.txt +11 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib.egg-info/requires.txt +5 -0
- alpha_engine_lib-0.49.0/tests/test_quant_stats_dsr.py +95 -0
- alpha_engine_lib-0.49.0/tests/test_quant_stats_expectancy.py +102 -0
- alpha_engine_lib-0.49.0/tests/test_quant_stats_information_coefficient.py +100 -0
- alpha_engine_lib-0.49.0/tests/test_quant_stats_multiple_testing.py +42 -0
- alpha_engine_lib-0.49.0/tests/test_quant_stats_risk_matched_benchmark.py +140 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/setup.cfg +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/agent_schemas.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/alerts.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/anthropic_payload.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/arcticdb.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/artifact_freshness.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/collector_results.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/cost.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/dates.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/decision_capture.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/ec2_spot.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/email_sender.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/eval_artifacts.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/http_retry.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/locks.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/logging.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/model_pricing.yaml +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/pillars.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/pipeline_status/__init__.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/pipeline_status/read.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/pipeline_status/registry.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/pipeline_status/templates.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/preflight.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/quant/__init__.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/quant/attribution.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/quant/factor_risk.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/quant/factor_risk_xs.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/quant/returns.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/quant/risk_measures.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/quant/riskstats.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/rag/__init__.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/rag/db.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/rag/embeddings.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/rag/migrations/0001_content_tsv.sql +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/rag/rerank.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/rag/retrieval.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/rag/schema.sql +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/reconcile.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/secrets.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/sources/__init__.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/sources/protocols.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/ssm_dispatcher.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/ssm_log_capture.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/telegram.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/trading_calendar.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/transparency.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/transparency_inventory.yaml +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/universe.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib.egg-info/dependency_links.txt +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib.egg-info/top_level.txt +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_agent_schemas.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_alerts.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_anthropic_payload.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_arcticdb.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_artifact_freshness.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_collector_results.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_cost.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_dates.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_decision_capture.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_ec2_spot.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_email_sender.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_eval_artifacts.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_http_retry.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_locks.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_logging.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_pillars.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_pipeline_status_read.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_pipeline_status_registry.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_pipeline_status_templates.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_preflight.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_quant_attribution.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_quant_factor_risk.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_quant_factor_risk_xs.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_quant_returns.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_quant_risk_measures.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_quant_riskstats.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_rag.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_rag_rerank.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_rag_retrieval_hybrid.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_reconcile.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_secrets.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_sources_protocols.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_ssm_dispatcher.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_ssm_log_capture.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_telegram.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_trading_calendar.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_transparency.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_universe.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_version_bump_workflow.py +0 -0
- {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_version_pin.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: alpha-engine-lib
|
|
3
|
-
Version: 0.48.0
|
|
3
|
+
Version: 0.49.0
|
|
4
4
|
Summary: Shared utilities for the Alpha Engine modules: preflight, logging, ArcticDB, dates, decision capture, cost telemetry, Anthropic payload chokepoint, artifact freshness, RAG, agent schemas, SSM secrets, Telegram + SNS alerts, EC2 spot resilience, SSM log-capture, SSM dispatcher, Step-Functions execution-state projection, S3-conditional-PUT writer locks, and bounded-backoff HTTP retry. Full surface documented in README.
|
|
5
5
|
Author: Brian McMahon
|
|
6
6
|
License: Proprietary
|
|
@@ -20,6 +20,10 @@ Provides-Extra: quant-xs
|
|
|
20
20
|
Requires-Dist: numpy>=1.24; extra == "quant-xs"
|
|
21
21
|
Requires-Dist: pandas>=2.0; extra == "quant-xs"
|
|
22
22
|
Requires-Dist: scikit-learn>=1.0; extra == "quant-xs"
|
|
23
|
+
Provides-Extra: quant-stats
|
|
24
|
+
Requires-Dist: numpy>=1.24; extra == "quant-stats"
|
|
25
|
+
Requires-Dist: pandas>=2.0; extra == "quant-stats"
|
|
26
|
+
Requires-Dist: scipy>=1.7; extra == "quant-stats"
|
|
23
27
|
Provides-Extra: flow-doctor
|
|
24
28
|
Requires-Dist: flow-doctor[diagnosis,s3]<0.5.0,>=0.4.0; extra == "flow-doctor"
|
|
25
29
|
Provides-Extra: rag
|
|
@@ -264,6 +268,7 @@ The shared institutional-analytics engine: pure, front-end- and data-source-agno
|
|
|
264
268
|
- **`quant.riskstats`** — `volatility`, `sharpe_ratio`, `sortino_ratio`, `max_drawdown` (stdlib).
|
|
265
269
|
- **`quant.returns`** — `xirr` (money-weighted, Newton + bisection), `time_weighted_return` (GIPS), `cumulative_return`, `annualize` (stdlib).
|
|
266
270
|
- **`quant.attribution`** — single-period Brinson-Fachler decomposition (`brinson_fachler`) + multi-period Cariño linking (`link_periods`) (stdlib).
|
|
271
|
+
- **`quant.stats`** — strategy/signal-quality evaluation metrics (lifted from the backtester's `analysis/`): `dsr` (Probabilistic + Deflated Sharpe, López de Prado), `information_coefficient` (Spearman rank IC), `expectancy` (hit-rate × win/loss decomposition), `multiple_testing` (Benjamini-Hochberg FDR), `risk_matched_benchmark` (EW-high-vol + beta-matched-SPY baselines + Information Ratio). **Needs numpy + pandas; scipy is optional** — `pip install "alpha-engine-lib[quant-stats]"` installs all three (scipy is used only for the IC p-value; a numpy fallback applies when it is absent).
|
|
267
272
|
|
|
268
273
|
### `http_retry` — bounded-backoff transient-API retry chokepoint
|
|
269
274
|
|
|
@@ -229,6 +229,7 @@ The shared institutional-analytics engine: pure, front-end- and data-source-agno
|
|
|
229
229
|
- **`quant.riskstats`** — `volatility`, `sharpe_ratio`, `sortino_ratio`, `max_drawdown` (stdlib).
|
|
230
230
|
- **`quant.returns`** — `xirr` (money-weighted, Newton + bisection), `time_weighted_return` (GIPS), `cumulative_return`, `annualize` (stdlib).
|
|
231
231
|
- **`quant.attribution`** — single-period Brinson-Fachler decomposition (`brinson_fachler`) + multi-period Cariño linking (`link_periods`) (stdlib).
|
|
232
|
+
- **`quant.stats`** — strategy/signal-quality evaluation metrics (lifted from the backtester's `analysis/`): `dsr` (Probabilistic + Deflated Sharpe, López de Prado), `information_coefficient` (Spearman rank IC), `expectancy` (hit-rate × win/loss decomposition), `multiple_testing` (Benjamini-Hochberg FDR), `risk_matched_benchmark` (EW-high-vol + beta-matched-SPY baselines + Information Ratio). **Needs numpy + pandas; scipy is optional** — `pip install "alpha-engine-lib[quant-stats]"` installs all three (scipy is used only for the IC p-value; a numpy fallback applies when it is absent).
|
|
232
233
|
|
|
233
234
|
### `http_retry` — bounded-backoff transient-API retry chokepoint
|
|
234
235
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "alpha-engine-lib"
|
|
7
|
-
version = "0.48.0"
|
|
7
|
+
version = "0.49.0"
|
|
8
8
|
description = "Shared utilities for the Alpha Engine modules: preflight, logging, ArcticDB, dates, decision capture, cost telemetry, Anthropic payload chokepoint, artifact freshness, RAG, agent schemas, SSM secrets, Telegram + SNS alerts, EC2 spot resilience, SSM log-capture, SSM dispatcher, Step-Functions execution-state projection, S3-conditional-PUT writer locks, and bounded-backoff HTTP retry. Full surface documented in README."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
# EC2 still runs Python 3.9 on the always-on micro instance (boto3 drops
|
|
@@ -39,6 +39,10 @@ quant = ["numpy>=1.24"]
|
|
|
39
39
|
# separate from [quant] so the numpy-only consumers (e.g. robodashboard)
|
|
40
40
|
# don't pull pandas+sklearn.
|
|
41
41
|
quant-xs = ["numpy>=1.24", "pandas>=2.0", "scikit-learn>=1.0"]
|
|
42
|
+
# Statistical evaluation utilities (alpha_engine_lib.quant.stats — PSR/DSR, IC,
|
|
43
|
+
# expectancy, BH-FDR, risk-matched benchmarks). numpy + pandas always; scipy is
|
|
44
|
+
# used by information_coefficient for the p-value (numpy fallback otherwise).
|
|
45
|
+
quant-stats = ["numpy>=1.24", "pandas>=2.0", "scipy>=1.7"]
|
|
42
46
|
flow_doctor = ["flow-doctor[diagnosis,s3]>=0.4.0,<0.5.0"]
|
|
43
47
|
rag = [
|
|
44
48
|
"psycopg2-binary>=2.9",
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Statistical evaluation utilities for signal/strategy quality assessment.
|
|
2
|
+
|
|
3
|
+
Pure-compute metrics consumed across the fleet (backtester, robodashboard) for
|
|
4
|
+
judging signal quality, strategy skill, and selection bias — no I/O. Import the
|
|
5
|
+
submodule you need (the package keeps no eager imports). Most need numpy+pandas;
|
|
6
|
+
``information_coefficient`` additionally uses scipy when present (with a numpy
|
|
7
|
+
fallback). Install ``alpha-engine-lib[quant-stats]``.
|
|
8
|
+
|
|
9
|
+
Modules:
|
|
10
|
+
- ``dsr`` — Probabilistic + Deflated Sharpe (López de Prado)
|
|
11
|
+
- ``information_coefficient`` — Spearman rank IC of conviction vs forward return
|
|
12
|
+
- ``expectancy`` — hit-rate × win/loss decomposition
|
|
13
|
+
- ``multiple_testing`` — Benjamini-Hochberg FDR correction
|
|
14
|
+
- ``risk_matched_benchmark`` — EW-high-vol + beta-matched-SPY baselines + IR
|
|
15
|
+
|
|
16
|
+
Example::
|
|
17
|
+
|
|
18
|
+
from alpha_engine_lib.quant.stats.dsr import compute_dsr
|
|
19
|
+
from alpha_engine_lib.quant.stats.multiple_testing import benjamini_hochberg
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
"""dsr — Probabilistic Sharpe Ratio (PSR) and Deflated Sharpe Ratio (DSR).
|
|
2
|
+
|
|
3
|
+
Confidence-adjusted Sharpe per López de Prado:
|
|
4
|
+
- PSR (Bailey & López de Prado 2012): probability that the *true* Sharpe
|
|
5
|
+
is above a benchmark, given the observed sample size + skew + kurtosis.
|
|
6
|
+
Answers "is this Sharpe distinguishable from the benchmark, given how
|
|
7
|
+
little data we have?"
|
|
8
|
+
- DSR (Bailey & López de Prado 2014): PSR with a multiple-testing
|
|
9
|
+
correction. The benchmark is set to the expected maximum Sharpe under
|
|
10
|
+
N independent trials, so DSR > 0.95 means "even after accounting for
|
|
11
|
+
cherry-picking from N candidates, this Sharpe is significant."
|
|
12
|
+
|
|
13
|
+
The promotion gate for any multiple-testing factory (param sweeps that
|
|
14
|
+
auto-promote the top-Sharpe combo): point-estimate Sharpe on a short sample
|
|
15
|
+
has a wide CI; DSR is what prevents promoting noise winners.
|
|
16
|
+
|
|
17
|
+
Mathematical reference:
|
|
18
|
+
Bailey & López de Prado (2012) "The Sharpe Ratio Efficient Frontier"
|
|
19
|
+
Bailey & López de Prado (2014) "The Deflated Sharpe Ratio: Correcting
|
|
20
|
+
for Selection Bias, Backtest Overfitting, and Non-Normality"
|
|
21
|
+
|
|
22
|
+
Pure-compute. Operates on a daily return series + sample-size metadata;
|
|
23
|
+
no I/O.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import logging
|
|
29
|
+
import math
|
|
30
|
+
from typing import TypedDict
|
|
31
|
+
|
|
32
|
+
import numpy as np
|
|
33
|
+
import pandas as pd
|
|
34
|
+
|
|
35
|
+
logger = logging.getLogger(__name__)
|
|
36
|
+
|
|
37
|
+
_TRADING_DAYS_PER_YEAR = 252
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class PSRResult(TypedDict, total=False):
    """Result payload returned by ``compute_psr``.

    Declared ``total=False`` because the ``insufficient_data`` result carries
    only ``status`` and ``n``; the remaining keys are present on ``ok`` results.
    """

    status: str  # "ok" | "insufficient_data"
    n: int  # number of finite observations used
    sharpe: float  # observed annualized Sharpe
    sharpe_benchmark: float  # benchmark Sharpe being tested against
    psr: float  # probability in [0, 1] that true SR > benchmark
    skew: float  # sample skewness of the return series
    kurtosis: float  # *excess* kurtosis (0 for a normal distribution)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class DSRResult(TypedDict, total=False):
    """Result payload returned by ``compute_dsr``.

    Declared ``total=False`` because the ``insufficient_data`` result carries
    only ``status``, ``n`` and ``n_trials``.
    """

    status: str  # "ok" | "insufficient_data"
    n: int  # number of finite observations used
    sharpe: float  # observed annualized Sharpe of the selected candidate
    n_trials: int  # number of candidates considered (multiple-testing N)
    sharpe_benchmark: float  # implied benchmark from N_trials under H0: SR=0
    dsr: float  # probability that the true Sharpe survives selection bias
    skew: float  # sample skewness of the return series
    kurtosis: float  # *excess* kurtosis (0 for a normal distribution)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _normal_cdf(x: float) -> float:
    """Standard normal CDF — pure-Python, no scipy dependency.

    Computed as ``0.5 * erfc(-x / sqrt(2))``, which is algebraically the same
    as ``0.5 * (1 + erf(x / sqrt(2)))`` but does not lose precision in the
    lower tail: for strongly negative ``x`` the ``1 + erf(...)`` form cancels
    to exactly 0.0, whereas ``erfc`` keeps the small probability.
    """
    return 0.5 * math.erfc(-x / math.sqrt(2.0))
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _annualized_sharpe(returns: np.ndarray) -> float:
    """Annualized Sharpe (risk-free = 0), sample-std (ddof=1).

    Returns 0.0 when fewer than two observations are supplied or when the
    series is (numerically) constant. The constant check is relative rather
    than an exact ``std == 0.0`` comparison: a constant series such as thirty
    copies of 0.01 can produce a standard deviation of ~1e-18 from rounding
    in the mean, which would otherwise turn into an astronomically large
    Sharpe.
    """
    if returns.size < 2:
        return 0.0
    mean = float(returns.mean())
    std = float(returns.std(ddof=1))
    # Dispersion below ~1e-12 of the largest observation is rounding noise.
    scale = float(np.abs(returns).max())
    if std <= 1e-12 * scale:
        return 0.0
    return mean / std * math.sqrt(_TRADING_DAYS_PER_YEAR)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _sample_skew_kurtosis(returns: np.ndarray) -> tuple[float, float]:
    """Sample skewness and excess kurtosis. Pearson-style; scipy-equivalent.

    Both statistics use population (divide-by-n) central moments.
    Excess kurtosis = K - 3 (so a normal has 0 excess kurtosis).

    Returns (0, 0) on insufficient sample (fewer than 4 observations) and for
    a (numerically) constant series. The constant check is relative rather
    than an exact ``var == 0.0`` comparison, because rounding in the mean can
    leave a variance of ~1e-36 for a constant input, and the moment ratios
    would then be ratios of pure rounding noise.
    """
    n = returns.size
    if n < 4:
        return 0.0, 0.0
    centered = returns - returns.mean()
    var = float((centered * centered).mean())
    std = math.sqrt(var)
    # Dispersion below ~1e-12 of the largest observation is rounding noise.
    scale = float(np.abs(returns).max())
    if std <= 1e-12 * scale:
        return 0.0, 0.0
    skew = float((centered ** 3).mean() / (std ** 3))
    kurt_excess = float((centered ** 4).mean() / (var * var)) - 3.0
    return skew, kurt_excess
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def compute_psr(
    daily_returns: pd.Series | np.ndarray,
    sharpe_benchmark: float = 0.0,
) -> PSRResult:
    """Probabilistic Sharpe Ratio (Bailey & López de Prado 2012).

    Parameters
    ----------
    daily_returns : array-like
        Daily simple returns. Non-finite entries (NaN, ±inf) are dropped.
    sharpe_benchmark : float
        Annualized Sharpe to test against. The default 0.0 asks "is the
        true SR positive?".

    Returns
    -------
    PSRResult dict with:
        status: "ok" | "insufficient_data" (fewer than 30 finite observations;
            only ``status`` and ``n`` are set in that case)
        n: sample size after dropping non-finite values
        sharpe: observed annualized SR
        sharpe_benchmark: as input
        psr: probability that true SR > benchmark
        skew: sample skewness
        kurtosis: sample *excess* kurtosis

    Formula:
        PSR(SR*) = Phi( (SR_hat - SR*) * sqrt(n - 1)
                        / sqrt(1 - skew * SR_hat + (kurtosis - 1)/4 * SR_hat^2) )

    SR_hat and SR* are *non-annualized* (daily) Sharpe ratios and "kurtosis"
    in the formula is the raw fourth standardized moment (3.0 for a normal),
    so the annualized inputs are scaled down and 3.0 is added back to the
    excess kurtosis before it is evaluated.
    """
    finite = np.asarray(daily_returns, dtype=np.float64)
    finite = finite[np.isfinite(finite)]
    n_obs = finite.size
    # PSR is an asymptotic result; it is not meaningful on tiny samples.
    if n_obs < 30:
        return {"status": "insufficient_data", "n": n_obs}

    observed_sr = _annualized_sharpe(finite)
    # Bring both Sharpe ratios onto the daily scale the formula expects.
    daily_sr = observed_sr / math.sqrt(_TRADING_DAYS_PER_YEAR)
    daily_benchmark = sharpe_benchmark / math.sqrt(_TRADING_DAYS_PER_YEAR)

    skew, excess_kurt = _sample_skew_kurtosis(finite)
    raw_kurt = excess_kurt + 3.0

    variance_term = 1.0 - skew * daily_sr + (raw_kurt - 1.0) / 4.0 * daily_sr ** 2
    if variance_term <= 0.0:
        # Pathological skew/kurtosis combination: the formula has no real
        # solution, so report maximum uncertainty instead.
        probability = 0.5
    else:
        z_score = (
            (daily_sr - daily_benchmark)
            * math.sqrt(n_obs - 1)
            / math.sqrt(variance_term)
        )
        probability = float(_normal_cdf(z_score))

    return {
        "status": "ok",
        "n": n_obs,
        "sharpe": observed_sr,
        "sharpe_benchmark": sharpe_benchmark,
        "psr": probability,
        "skew": skew,
        "kurtosis": excess_kurt,
    }
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
_EULER_MASCHERONI = 0.5772156649015329
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def compute_dsr(
    daily_returns: pd.Series | np.ndarray,
    n_trials: int,
) -> DSRResult:
    """Deflated Sharpe Ratio.

    Corrects PSR for the selection bias of choosing the maximum Sharpe
    from ``n_trials`` candidates. The benchmark Sharpe is set to the
    expected maximum SR under the null hypothesis (true SR = 0 for all
    candidates), accounting for sample size + sample moments.

    Parameters
    ----------
    daily_returns : array-like
        Daily returns of the *winner* (the candidate selected as best).
        Non-finite entries are dropped.
    n_trials : int
        Number of candidates considered when selecting this winner. For
        a 60-combo param sweep, n_trials = 60. Must be >= 1.

    Returns
    -------
    DSRResult dict with:
        status, n, sharpe, n_trials, sharpe_benchmark, dsr, skew, kurtosis
    With fewer than 30 finite observations only ``status``
    ("insufficient_data"), ``n`` and ``n_trials`` are set.

    Raises
    ------
    ValueError
        If ``n_trials`` is below 1.

    Formula (Bailey & López de Prado 2014, Theorem 1):
        E[max(SR)] ≈ V * (sqrt(2 ln N) - (gamma + ln ln N) / (2 sqrt(2 ln N)))
    where V is the standard deviation of estimated SRs across trials and
    gamma is Euler-Mascheroni. We approximate V with the sampling std of
    SR_hat = sqrt((1 - skew*SR + (k-1)/4 * SR^2) / (n - 1)) on the winner.

    DSR = PSR(SR_hat | benchmark = E[max(SR_null)]).

    Notes
    -----
    - n_trials = 1 reduces to PSR(0) — no selection correction needed.
    - For very high n_trials (>1000) the asymptotic expansion above is
      adequate; for small n (< 5) it overstates the threshold slightly,
      which is the conservative direction (harder to clear) — fine for
      a promotion gate.
    """
    if n_trials < 1:
        raise ValueError(f"n_trials must be >= 1, got {n_trials}")

    r = np.asarray(daily_returns, dtype=np.float64)
    r = r[np.isfinite(r)]
    n = r.size
    if n < 30:
        return {"status": "insufficient_data", "n": n, "n_trials": n_trials}

    if n_trials == 1:
        # No selection bias correction needed; reduce to PSR(0).
        psr_result = compute_psr(r, sharpe_benchmark=0.0)
        return {
            "status": psr_result["status"],
            "n": n,
            "sharpe": psr_result.get("sharpe", 0.0),
            "n_trials": 1,
            "sharpe_benchmark": 0.0,
            "dsr": psr_result.get("psr", 0.5),
            "skew": psr_result.get("skew", 0.0),
            "kurtosis": psr_result.get("kurtosis", 0.0),
        }

    sr_annualized = _annualized_sharpe(r)
    sr_daily = sr_annualized / math.sqrt(_TRADING_DAYS_PER_YEAR)
    skew, kurt_excess = _sample_skew_kurtosis(r)
    # The formula wants the raw 4th standardized moment (3.0 for a normal).
    kurt_raw = kurt_excess + 3.0

    # Sampling variance of SR_hat (per López de Prado eq. 5).
    var_sr_sq = (1.0 - skew * sr_daily + (kurt_raw - 1.0) / 4.0 * sr_daily ** 2) / (n - 1)
    if var_sr_sq <= 0.0:
        # Pathological skew/kurtosis combination: report maximum uncertainty.
        return {
            "status": "ok",
            "n": n,
            "sharpe": sr_annualized,
            "n_trials": n_trials,
            "sharpe_benchmark": 0.0,
            "dsr": 0.5,
            "skew": skew,
            "kurtosis": kurt_excess,
        }
    v = math.sqrt(var_sr_sq)

    # Expected max SR under the null, in daily SR units. n_trials > 1 is
    # guaranteed here (n_trials == 1 returned above), so ln N > 0 and
    # ln ln N is always defined — no fallback branch is needed.
    ln_n = math.log(n_trials)
    sqrt_2_ln_n = math.sqrt(2.0 * ln_n)
    ln_ln_n = math.log(ln_n)
    expected_max_sr_daily = v * (sqrt_2_ln_n - (_EULER_MASCHERONI + ln_ln_n) / (2.0 * sqrt_2_ln_n))
    expected_max_sr_annualized = expected_max_sr_daily * math.sqrt(_TRADING_DAYS_PER_YEAR)

    psr_result = compute_psr(r, sharpe_benchmark=expected_max_sr_annualized)

    return {
        "status": psr_result["status"],
        "n": n,
        "sharpe": sr_annualized,
        "n_trials": n_trials,
        "sharpe_benchmark": expected_max_sr_annualized,
        "dsr": psr_result.get("psr", 0.5),
        "skew": skew,
        "kurtosis": kurt_excess,
    }
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
"""expectancy — hit rate × win/loss ratio decomposition.
|
|
2
|
+
|
|
3
|
+
The single most diagnostic breakdown for distinguishing skilled vs unskilled
|
|
4
|
+
risk-taking:
|
|
5
|
+
|
|
6
|
+
- **Selection skill**: high hit rate with symmetric W/L ratio (~1.0) — picks
|
|
7
|
+
winners more often than losers, magnitudes about equal.
|
|
8
|
+
- **Convexity skill**: moderate hit rate with W/L ratio > 1.5 — rides winners
|
|
9
|
+
and cuts losers; total expectancy positive even with <50% hit rate.
|
|
10
|
+
- **No skill (just YOLO into vol)**: declining hit rate with no compensating
|
|
11
|
+
W/L improvement → expectancy ≤ 0.
|
|
12
|
+
|
|
13
|
+
Formula:
|
|
14
|
+
expectancy = hit_rate * avg_win - (1 - hit_rate) * avg_loss
|
|
15
|
+
|
|
16
|
+
where avg_win is the mean of positive returns (or alpha) and avg_loss is the
|
|
17
|
+
mean *magnitude* of negative returns. Reports both expectancy and the
|
|
18
|
+
decomposition components so consumers can see WHICH dimension is failing.
|
|
19
|
+
|
|
20
|
+
Pure-compute. Operates on a returns array; no I/O.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import logging
|
|
26
|
+
from typing import TypedDict
|
|
27
|
+
|
|
28
|
+
import numpy as np
|
|
29
|
+
import pandas as pd
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ExpectancyResult(TypedDict, total=False):
    """Result payload returned by ``compute_expectancy``.

    Declared ``total=False`` because the ``insufficient_data`` result carries
    only ``status`` and ``n``.
    """

    status: str  # "ok" | "insufficient_data" | "no_wins" | "no_losses"
    n: int  # number of finite observations used
    hit_rate: float  # fraction of returns above the win/loss threshold
    # NOTE(review): the compute_expectancy docstring says avg_win, avg_loss and
    # win_loss_ratio may be None (no wins / no losses) — confirm against the
    # implementation before tightening these annotations.
    avg_win: float
    avg_loss: float  # magnitude (positive number)
    win_loss_ratio: float  # avg_win / avg_loss
    expectancy: float  # hit_rate * avg_win - (1 - hit_rate) * avg_loss
    expectancy_per_unit_loss: float  # expectancy / avg_loss; the "R-multiple" expectancy
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def compute_expectancy(
    returns: pd.Series | np.ndarray,
    threshold: float = 0.0,
    min_samples: int = 10,
) -> ExpectancyResult:
    """Compute expectancy decomposition over a return series.

    Parameters
    ----------
    returns : pd.Series or np.ndarray
        Per-trade or per-pick returns (or alphas). Non-finite entries
        (NaN and +/-inf) are dropped before anything is computed.
    threshold : float
        Win/loss boundary. Default 0 → wins are positive returns. Set
        non-zero to compute relative to a benchmark return (e.g.
        threshold = SPY_return for "did we beat SPY?" expectancy).
        All reported magnitudes (avg_win, avg_loss, expectancy) are
        measured relative to this threshold, i.e. on ``returns - threshold``.
    min_samples : int
        Minimum finite samples required to compute. Returns
        status=insufficient_data below this floor. Default 10. An empty
        sample is always reported as insufficient_data, even when
        ``min_samples <= 0``, because no rate or mean is defined for it.

    Returns
    -------
    ExpectancyResult dict with:
        status: "ok" | "insufficient_data" | "no_wins" | "no_losses"
        n: sample size after dropping non-finite entries
        hit_rate: fraction of samples where return > threshold
        avg_win: mean excess of returns > threshold (None if no wins)
        avg_loss: mean magnitude of the excess of returns <= threshold
            (0.0 if there are no losses)
        win_loss_ratio: avg_win / avg_loss (None if either is missing/zero)
        expectancy: hit_rate * avg_win - (1 - hit_rate) * avg_loss
        expectancy_per_unit_loss: expectancy / avg_loss (R-multiple form;
            None when avg_loss is missing/zero)

    The insufficient_data result carries only ``status`` and ``n``.

    Notes
    -----
    The R-multiple form (expectancy / avg_loss) is the "expectancy per unit of
    risk taken" — useful for comparing across regimes where absolute return
    levels shift but the ratio of skilled-edge-to-typical-loss is the
    invariant signal of skill.
    """
    arr = np.asarray(returns, dtype=np.float64)
    arr = arr[np.isfinite(arr)]
    n = arr.size
    # n == 0 is rejected explicitly: with min_samples <= 0 an empty sample
    # would otherwise reach the hit-rate division below and raise
    # ZeroDivisionError.
    if n == 0 or n < min_samples:
        return {"status": "insufficient_data", "n": n}

    excess = arr - threshold
    wins = excess[excess > 0]
    losses = excess[excess <= 0]  # ties at the threshold count as losses

    hit_rate = float(wins.size) / n

    if wins.size == 0:
        return {
            "status": "no_wins",
            "n": n,
            "hit_rate": 0.0,
            "avg_win": None,  # type: ignore[typeddict-item]
            "avg_loss": float(-losses.mean()) if losses.size else 0.0,
            "win_loss_ratio": None,  # type: ignore[typeddict-item]
            # With one side missing the formula reduces to the plain mean.
            "expectancy": float(excess.mean()),
            "expectancy_per_unit_loss": None,  # type: ignore[typeddict-item]
        }

    if losses.size == 0:
        return {
            "status": "no_losses",
            "n": n,
            "hit_rate": 1.0,
            "avg_win": float(wins.mean()),
            "avg_loss": 0.0,
            "win_loss_ratio": None,  # type: ignore[typeddict-item]
            "expectancy": float(excess.mean()),
            "expectancy_per_unit_loss": None,  # type: ignore[typeddict-item]
        }

    avg_win = float(wins.mean())
    avg_loss = float(-losses.mean())  # report as positive magnitude
    # avg_loss can still be exactly 0 when every "loss" is a tie at the
    # threshold; the ratios are undefined in that case.
    win_loss_ratio = avg_win / avg_loss if avg_loss > 0 else None
    expectancy = hit_rate * avg_win - (1.0 - hit_rate) * avg_loss
    epul = expectancy / avg_loss if avg_loss > 0 else None

    return {
        "status": "ok",
        "n": n,
        "hit_rate": hit_rate,
        "avg_win": avg_win,
        "avg_loss": avg_loss,
        "win_loss_ratio": win_loss_ratio,  # type: ignore[typeddict-item]
        "expectancy": expectancy,
        "expectancy_per_unit_loss": epul,  # type: ignore[typeddict-item]
    }
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def compute_expectancy_by_group(
    df: pd.DataFrame,
    return_col: str,
    group_col: str,
    threshold: float = 0.0,
    min_samples: int = 10,
) -> dict[str, ExpectancyResult]:
    """Run ``compute_expectancy`` separately for every value of a grouping column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding one row per trade/pick.
    return_col : str
        Column with the returns passed to ``compute_expectancy``.
    group_col : str
        Column to stratify by (team_id, conviction, sector, ...).
    threshold, min_samples
        Forwarded unchanged to ``compute_expectancy`` for each group.

    Returns
    -------
    dict[group_value, ExpectancyResult]
        One entry per group produced by ``df.groupby(group_col)``, keyed by
        ``str(group_value)``. Groups below ``min_samples`` get
        status=insufficient_data entries.

    Raises
    ------
    KeyError
        If ``return_col`` or ``group_col`` is not a column of ``df``.
    """
    available = df.columns
    if return_col not in available:
        raise KeyError(f"return_col {return_col!r} not in dataframe")
    if group_col not in available:
        raise KeyError(f"group_col {group_col!r} not in dataframe")

    # Keys are stringified so the mapping stays uniform whatever the
    # dtype of the grouping column.
    return {
        str(key): compute_expectancy(
            frame[return_col],
            threshold=threshold,
            min_samples=min_samples,
        )
        for key, frame in df.groupby(group_col)
    }
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""information_coefficient — Spearman rank correlation between a conviction/
|
|
2
|
+
score and forward returns.
|
|
3
|
+
|
|
4
|
+
The risk-invariant quality metric. IC of 0.05 on conservative picks vs 0.02 on
|
|
5
|
+
aggressive picks tells you the signal is *worse* at the harder task even if
|
|
6
|
+
absolute returns went up.
|
|
7
|
+
|
|
8
|
+
Why Spearman over Pearson: rank correlation is invariant to the scale and
|
|
9
|
+
distribution of the conviction scores — they only need to *order* picks
|
|
10
|
+
correctly; absolute calibration is a separate concern.
|
|
11
|
+
|
|
12
|
+
Pure-compute. Operates on parallel arrays of (conviction, return); no I/O.
|
|
13
|
+
scipy.stats.spearmanr is used when available (for the p-value); otherwise a
|
|
14
|
+
numpy rank-then-Pearson fallback gives the identical IC with no p-value.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
from typing import TypedDict
|
|
21
|
+
|
|
22
|
+
import numpy as np
|
|
23
|
+
import pandas as pd
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class ICResult(TypedDict, total=False):
    """Result payload of ``compute_ic``.

    Declared with ``total=False`` because the ``insufficient_data`` result
    carries only ``status`` and ``n``.
    """

    # Outcome tag: "ok" | "insufficient_data" | "no_variance"
    status: str
    # Number of (conviction, return) pairs left after filtering
    n: int
    # Spearman rank correlation
    ic: float
    # Two-sided p-value vs null hypothesis IC = 0
    p_value: float
    # Number of distinct conviction levels
    n_buckets: int
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def compute_ic(
    conviction: pd.Series | np.ndarray,
    forward_return: pd.Series | np.ndarray,
    min_samples: int = 20,
) -> ICResult:
    """Compute Spearman rank correlation between conviction + forward return.

    Parameters
    ----------
    conviction : array-like
        Stated conviction or composite score per pick. Higher = more
        confident. Numeric or ordinal (string ranks need to be encoded
        upstream — Spearman is on the rank, but the array must sort
        meaningfully).
    forward_return : array-like
        Realized forward return per pick over the prediction horizon.
        Same length as ``conviction``. Values are paired by position (any
        pandas index is ignored). A non-finite value (NaN or +/-inf) in
        either column → that pair is dropped before computing.
    min_samples : int
        Minimum valid pairs required. Below this floor returns
        status=insufficient_data. Default 20 (Spearman p-values are
        unreliable on small samples).

    Returns
    -------
    ICResult dict with:
        status: "ok" | "insufficient_data" | "no_variance"
        n: number of (conviction, return) pairs after filtering
        ic: Spearman rank correlation in [-1, 1]
        p_value: two-sided p-value (NaN when scipy is not installed)
        n_buckets: number of distinct conviction levels (low = collapsed
                   conviction signal — the source isn't differentiating
                   even if IC happens to look fine)

    The insufficient_data result carries only ``status`` and ``n``.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.

    Notes
    -----
    - Uses scipy.stats.spearmanr if scipy is importable; otherwise falls
      back to numpy rank-then-pearson, which gives the identical IC but no
      p-value. Only a missing scipy triggers the fallback — errors raised
      by the computation itself propagate to the caller.
    - n_buckets is reported separately so callers can flag the degenerate
      case where conviction (or the return) is constant. The correlation
      is undefined there; it is reported as ic=0.0, p_value=1.0 with
      status=no_variance.
    """
    # Flatten so equal-sized inputs of different shapes (e.g. a column
    # vector against a 1-D array) are paired element by element.
    c = np.asarray(conviction, dtype=np.float64).ravel()
    r = np.asarray(forward_return, dtype=np.float64).ravel()
    if c.size != r.size:
        raise ValueError(
            f"conviction (n={c.size}) and forward_return (n={r.size}) "
            "must be same length"
        )
    valid = np.isfinite(c) & np.isfinite(r)
    c = c[valid]
    r = r[valid]
    n = c.size
    if n < min_samples:
        return {"status": "insufficient_data", "n": n}

    n_buckets = int(np.unique(c).size)
    # n_buckets < 2 covers both the empty sample and constant conviction, and
    # short-circuits before min()/max() could be called on an empty array.
    # min == max is the exact constancy test (avoids float64 std residual).
    if n_buckets < 2 or r.min() == r.max():
        return {
            "status": "no_variance",
            "n": n,
            "n_buckets": n_buckets,
            "ic": 0.0,
            "p_value": 1.0,
        }

    try:
        from scipy.stats import spearmanr  # type: ignore[import-not-found]
    except ImportError:
        # Fallback: rank-then-pearson. Identical IC value; no p-value.
        c_rank = pd.Series(c).rank().to_numpy()
        r_rank = pd.Series(r).rank().to_numpy()
        ic = float(np.corrcoef(c_rank, r_rank)[0, 1])
        p_value = float("nan")
    else:
        # Tuple unpacking works on every scipy release, whereas the
        # ``.statistic`` attribute only exists on newer ones.
        rho, p_raw = spearmanr(c, r)
        ic = float(rho)
        p_value = float(p_raw)

    return {
        "status": "ok",
        "n": n,
        "n_buckets": n_buckets,
        "ic": ic,
        "p_value": p_value,
    }
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def compute_ic_by_bucket(
    df: pd.DataFrame,
    conviction_col: str,
    return_col: str,
    bucket_col: str,
    min_samples: int = 20,
) -> dict[str, ICResult]:
    """Compute the IC separately inside each value of a bucket column.

    This is the "is the signal good at the harder task" cut: stratify by
    sector, conviction tier, regime, ... and evaluate ``compute_ic`` within
    each stratum. A signal whose IC drops on its highest-conviction picks
    is failing exactly where it claims to be most confident.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding one row per pick.
    conviction_col, return_col : str
        Columns passed to ``compute_ic`` as conviction and forward return.
    bucket_col : str
        Column to stratify by.
    min_samples : int
        Forwarded unchanged to ``compute_ic`` for each bucket.

    Returns
    -------
    dict[bucket_value, ICResult]
        One entry per group produced by ``df.groupby(bucket_col)``, keyed
        by ``str(bucket_value)``.

    Raises
    ------
    KeyError
        If any of the three named columns is missing from ``df``.
    """
    available = df.columns
    for col in (conviction_col, return_col, bucket_col):
        if col in available:
            continue
        raise KeyError(f"column {col!r} not in dataframe")

    return {
        str(key): compute_ic(
            frame[conviction_col],
            frame[return_col],
            min_samples=min_samples,
        )
        for key, frame in df.groupby(bucket_col)
    }
|