alpha-engine-lib 0.48.0__tar.gz → 0.49.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/PKG-INFO +6 -1
  2. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/README.md +1 -0
  3. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/pyproject.toml +5 -1
  4. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/__init__.py +1 -1
  5. alpha_engine_lib-0.49.0/src/alpha_engine_lib/quant/stats/__init__.py +22 -0
  6. alpha_engine_lib-0.49.0/src/alpha_engine_lib/quant/stats/dsr.py +278 -0
  7. alpha_engine_lib-0.49.0/src/alpha_engine_lib/quant/stats/expectancy.py +161 -0
  8. alpha_engine_lib-0.49.0/src/alpha_engine_lib/quant/stats/information_coefficient.py +149 -0
  9. alpha_engine_lib-0.49.0/src/alpha_engine_lib/quant/stats/multiple_testing.py +48 -0
  10. alpha_engine_lib-0.49.0/src/alpha_engine_lib/quant/stats/risk_matched_benchmark.py +305 -0
  11. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib.egg-info/PKG-INFO +6 -1
  12. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib.egg-info/SOURCES.txt +11 -0
  13. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib.egg-info/requires.txt +5 -0
  14. alpha_engine_lib-0.49.0/tests/test_quant_stats_dsr.py +95 -0
  15. alpha_engine_lib-0.49.0/tests/test_quant_stats_expectancy.py +102 -0
  16. alpha_engine_lib-0.49.0/tests/test_quant_stats_information_coefficient.py +100 -0
  17. alpha_engine_lib-0.49.0/tests/test_quant_stats_multiple_testing.py +42 -0
  18. alpha_engine_lib-0.49.0/tests/test_quant_stats_risk_matched_benchmark.py +140 -0
  19. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/setup.cfg +0 -0
  20. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/agent_schemas.py +0 -0
  21. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/alerts.py +0 -0
  22. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/anthropic_payload.py +0 -0
  23. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/arcticdb.py +0 -0
  24. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/artifact_freshness.py +0 -0
  25. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/collector_results.py +0 -0
  26. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/cost.py +0 -0
  27. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/dates.py +0 -0
  28. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/decision_capture.py +0 -0
  29. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/ec2_spot.py +0 -0
  30. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/email_sender.py +0 -0
  31. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/eval_artifacts.py +0 -0
  32. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/http_retry.py +0 -0
  33. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/locks.py +0 -0
  34. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/logging.py +0 -0
  35. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/model_pricing.yaml +0 -0
  36. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/pillars.py +0 -0
  37. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/pipeline_status/__init__.py +0 -0
  38. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/pipeline_status/read.py +0 -0
  39. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/pipeline_status/registry.py +0 -0
  40. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/pipeline_status/templates.py +0 -0
  41. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/preflight.py +0 -0
  42. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/quant/__init__.py +0 -0
  43. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/quant/attribution.py +0 -0
  44. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/quant/factor_risk.py +0 -0
  45. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/quant/factor_risk_xs.py +0 -0
  46. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/quant/returns.py +0 -0
  47. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/quant/risk_measures.py +0 -0
  48. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/quant/riskstats.py +0 -0
  49. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/rag/__init__.py +0 -0
  50. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/rag/db.py +0 -0
  51. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/rag/embeddings.py +0 -0
  52. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/rag/migrations/0001_content_tsv.sql +0 -0
  53. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/rag/rerank.py +0 -0
  54. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/rag/retrieval.py +0 -0
  55. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/rag/schema.sql +0 -0
  56. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/reconcile.py +0 -0
  57. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/secrets.py +0 -0
  58. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/sources/__init__.py +0 -0
  59. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/sources/protocols.py +0 -0
  60. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/ssm_dispatcher.py +0 -0
  61. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/ssm_log_capture.py +0 -0
  62. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/telegram.py +0 -0
  63. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/trading_calendar.py +0 -0
  64. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/transparency.py +0 -0
  65. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/transparency_inventory.yaml +0 -0
  66. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib/universe.py +0 -0
  67. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib.egg-info/dependency_links.txt +0 -0
  68. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/src/alpha_engine_lib.egg-info/top_level.txt +0 -0
  69. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_agent_schemas.py +0 -0
  70. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_alerts.py +0 -0
  71. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_anthropic_payload.py +0 -0
  72. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_arcticdb.py +0 -0
  73. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_artifact_freshness.py +0 -0
  74. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_collector_results.py +0 -0
  75. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_cost.py +0 -0
  76. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_dates.py +0 -0
  77. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_decision_capture.py +0 -0
  78. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_ec2_spot.py +0 -0
  79. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_email_sender.py +0 -0
  80. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_eval_artifacts.py +0 -0
  81. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_http_retry.py +0 -0
  82. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_locks.py +0 -0
  83. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_logging.py +0 -0
  84. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_pillars.py +0 -0
  85. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_pipeline_status_read.py +0 -0
  86. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_pipeline_status_registry.py +0 -0
  87. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_pipeline_status_templates.py +0 -0
  88. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_preflight.py +0 -0
  89. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_quant_attribution.py +0 -0
  90. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_quant_factor_risk.py +0 -0
  91. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_quant_factor_risk_xs.py +0 -0
  92. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_quant_returns.py +0 -0
  93. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_quant_risk_measures.py +0 -0
  94. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_quant_riskstats.py +0 -0
  95. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_rag.py +0 -0
  96. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_rag_rerank.py +0 -0
  97. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_rag_retrieval_hybrid.py +0 -0
  98. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_reconcile.py +0 -0
  99. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_secrets.py +0 -0
  100. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_sources_protocols.py +0 -0
  101. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_ssm_dispatcher.py +0 -0
  102. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_ssm_log_capture.py +0 -0
  103. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_telegram.py +0 -0
  104. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_trading_calendar.py +0 -0
  105. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_transparency.py +0 -0
  106. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_universe.py +0 -0
  107. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_version_bump_workflow.py +0 -0
  108. {alpha_engine_lib-0.48.0 → alpha_engine_lib-0.49.0}/tests/test_version_pin.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: alpha-engine-lib
3
- Version: 0.48.0
3
+ Version: 0.49.0
4
4
  Summary: Shared utilities for the Alpha Engine modules: preflight, logging, ArcticDB, dates, decision capture, cost telemetry, Anthropic payload chokepoint, artifact freshness, RAG, agent schemas, SSM secrets, Telegram + SNS alerts, EC2 spot resilience, SSM log-capture, SSM dispatcher, Step-Functions execution-state projection, S3-conditional-PUT writer locks, and bounded-backoff HTTP retry. Full surface documented in README.
5
5
  Author: Brian McMahon
6
6
  License: Proprietary
@@ -20,6 +20,10 @@ Provides-Extra: quant-xs
20
20
  Requires-Dist: numpy>=1.24; extra == "quant-xs"
21
21
  Requires-Dist: pandas>=2.0; extra == "quant-xs"
22
22
  Requires-Dist: scikit-learn>=1.0; extra == "quant-xs"
23
+ Provides-Extra: quant-stats
24
+ Requires-Dist: numpy>=1.24; extra == "quant-stats"
25
+ Requires-Dist: pandas>=2.0; extra == "quant-stats"
26
+ Requires-Dist: scipy>=1.7; extra == "quant-stats"
23
27
  Provides-Extra: flow-doctor
24
28
  Requires-Dist: flow-doctor[diagnosis,s3]<0.5.0,>=0.4.0; extra == "flow-doctor"
25
29
  Provides-Extra: rag
@@ -264,6 +268,7 @@ The shared institutional-analytics engine: pure, front-end- and data-source-agno
264
268
  - **`quant.riskstats`** — `volatility`, `sharpe_ratio`, `sortino_ratio`, `max_drawdown` (stdlib).
265
269
  - **`quant.returns`** — `xirr` (money-weighted, Newton + bisection), `time_weighted_return` (GIPS), `cumulative_return`, `annualize` (stdlib).
266
270
  - **`quant.attribution`** — single-period Brinson-Fachler decomposition (`brinson_fachler`) + multi-period Cariño linking (`link_periods`) (stdlib).
271
+ - **`quant.stats`** — strategy/signal-quality evaluation metrics (lifted from the backtester's `analysis/`): `dsr` (Probabilistic + Deflated Sharpe, López de Prado), `information_coefficient` (Spearman rank IC), `expectancy` (hit-rate × win/loss decomposition), `multiple_testing` (Benjamini-Hochberg FDR), `risk_matched_benchmark` (EW-high-vol + beta-matched-SPY baselines + Information Ratio). **Needs pandas + scipy** — `pip install "alpha-engine-lib[quant-stats]"` (scipy is only the IC p-value; numpy fallback otherwise).
267
272
 
268
273
  ### `http_retry` — bounded-backoff transient-API retry chokepoint
269
274
 
@@ -229,6 +229,7 @@ The shared institutional-analytics engine: pure, front-end- and data-source-agno
229
229
  - **`quant.riskstats`** — `volatility`, `sharpe_ratio`, `sortino_ratio`, `max_drawdown` (stdlib).
230
230
  - **`quant.returns`** — `xirr` (money-weighted, Newton + bisection), `time_weighted_return` (GIPS), `cumulative_return`, `annualize` (stdlib).
231
231
  - **`quant.attribution`** — single-period Brinson-Fachler decomposition (`brinson_fachler`) + multi-period Cariño linking (`link_periods`) (stdlib).
232
+ - **`quant.stats`** — strategy/signal-quality evaluation metrics (lifted from the backtester's `analysis/`): `dsr` (Probabilistic + Deflated Sharpe, López de Prado), `information_coefficient` (Spearman rank IC), `expectancy` (hit-rate × win/loss decomposition), `multiple_testing` (Benjamini-Hochberg FDR), `risk_matched_benchmark` (EW-high-vol + beta-matched-SPY baselines + Information Ratio). **Needs pandas + scipy** — `pip install "alpha-engine-lib[quant-stats]"` (scipy is only the IC p-value; numpy fallback otherwise).
232
233
 
233
234
  ### `http_retry` — bounded-backoff transient-API retry chokepoint
234
235
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "alpha-engine-lib"
7
- version = "0.48.0"
7
+ version = "0.49.0"
8
8
  description = "Shared utilities for the Alpha Engine modules: preflight, logging, ArcticDB, dates, decision capture, cost telemetry, Anthropic payload chokepoint, artifact freshness, RAG, agent schemas, SSM secrets, Telegram + SNS alerts, EC2 spot resilience, SSM log-capture, SSM dispatcher, Step-Functions execution-state projection, S3-conditional-PUT writer locks, and bounded-backoff HTTP retry. Full surface documented in README."
9
9
  readme = "README.md"
10
10
  # EC2 still runs Python 3.9 on the always-on micro instance (boto3 drops
@@ -39,6 +39,10 @@ quant = ["numpy>=1.24"]
39
39
  # separate from [quant] so the numpy-only consumers (e.g. robodashboard)
40
40
  # don't pull pandas+sklearn.
41
41
  quant-xs = ["numpy>=1.24", "pandas>=2.0", "scikit-learn>=1.0"]
42
+ # Statistical evaluation utilities (alpha_engine_lib.quant.stats — PSR/DSR, IC,
43
+ # expectancy, BH-FDR, risk-matched benchmarks). numpy + pandas always; scipy is
44
+ # used by information_coefficient for the p-value (numpy fallback otherwise).
45
+ quant-stats = ["numpy>=1.24", "pandas>=2.0", "scipy>=1.7"]
42
46
  flow_doctor = ["flow-doctor[diagnosis,s3]>=0.4.0,<0.5.0"]
43
47
  rag = [
44
48
  "psycopg2-binary>=2.9",
@@ -1,3 +1,3 @@
1
1
  """alpha-engine-lib — shared utilities for Alpha Engine modules."""
2
2
 
3
- __version__ = "0.48.0"
3
+ __version__ = "0.49.0"
@@ -0,0 +1,22 @@
1
+ """Statistical evaluation utilities for signal/strategy quality assessment.
2
+
3
+ Pure-compute metrics consumed across the fleet (backtester, robodashboard) for
4
+ judging signal quality, strategy skill, and selection bias — no I/O. Import the
5
+ submodule you need (the package keeps no eager imports). Most need numpy+pandas;
6
+ ``information_coefficient`` additionally uses scipy when present (with a numpy
7
+ fallback). Install ``alpha-engine-lib[quant-stats]``.
8
+
9
+ Modules:
10
+ - ``dsr`` — Probabilistic + Deflated Sharpe (López de Prado)
11
+ - ``information_coefficient`` — Spearman rank IC of conviction vs forward return
12
+ - ``expectancy`` — hit-rate × win/loss decomposition
13
+ - ``multiple_testing`` — Benjamini-Hochberg FDR correction
14
+ - ``risk_matched_benchmark`` — EW-high-vol + beta-matched-SPY baselines + IR
15
+
16
+ Example::
17
+
18
+ from alpha_engine_lib.quant.stats.dsr import compute_dsr
19
+ from alpha_engine_lib.quant.stats.multiple_testing import benjamini_hochberg
20
+ """
21
+
22
+ from __future__ import annotations
@@ -0,0 +1,278 @@
1
+ """dsr — Probabilistic Sharpe Ratio (PSR) and Deflated Sharpe Ratio (DSR).
2
+
3
+ Confidence-adjusted Sharpe per López de Prado:
4
+ - PSR (Bailey & López de Prado 2012): probability that the *true* Sharpe
5
+ is above a benchmark, given the observed sample size + skew + kurtosis.
6
+ Answers "is this Sharpe distinguishable from the benchmark, given how
7
+ little data we have?"
8
+ - DSR (Bailey & López de Prado 2014): PSR with a multiple-testing
9
+ correction. The benchmark is set to the expected maximum Sharpe under
10
+ N independent trials, so DSR > 0.95 means "even after accounting for
11
+ cherry-picking from N candidates, this Sharpe is significant."
12
+
13
+ The promotion gate for any multiple-testing factory (param sweeps that
14
+ auto-promote the top-Sharpe combo): point-estimate Sharpe on a short sample
15
+ has a wide CI; DSR is what prevents promoting noise winners.
16
+
17
+ Mathematical reference:
18
+ Bailey & López de Prado (2012) "The Sharpe Ratio Efficient Frontier"
19
+ Bailey & López de Prado (2014) "The Deflated Sharpe Ratio: Correcting
20
+ for Selection Bias, Backtest Overfitting, and Non-Normality"
21
+
22
+ Pure-compute. Operates on a daily return series + sample-size metadata;
23
+ no I/O.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import logging
29
+ import math
30
+ from typing import TypedDict
31
+
32
+ import numpy as np
33
+ import pandas as pd
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+ _TRADING_DAYS_PER_YEAR = 252
38
+
39
+
40
class PSRResult(TypedDict, total=False):
    """Mapping returned by ``compute_psr``.

    Declared with ``total=False`` because the ``insufficient_data`` result
    carries only ``status`` and ``n``.
    """

    status: str  # "ok" or "insufficient_data"
    n: int  # number of finite observations used
    sharpe: float  # observed annualized Sharpe
    sharpe_benchmark: float  # annualized benchmark Sharpe that was tested against
    psr: float  # probability in [0, 1] that the true SR exceeds the benchmark
    skew: float  # sample skewness of the return series
    kurtosis: float  # excess kurtosis of the return series (normal = 0)
48
+
49
+
50
class DSRResult(TypedDict, total=False):
    """Mapping returned by ``compute_dsr``.

    Declared with ``total=False`` because the ``insufficient_data`` result
    carries only ``status``, ``n`` and ``n_trials``.
    """

    status: str  # "ok" or "insufficient_data"
    n: int  # number of finite observations used
    sharpe: float  # observed annualized Sharpe of the selected candidate
    n_trials: int  # number of candidates considered (multiple-testing N)
    sharpe_benchmark: float  # implied benchmark from n_trials under H0: SR = 0
    dsr: float  # probability that the true Sharpe survives selection bias
    skew: float  # sample skewness of the return series
    kurtosis: float  # excess kurtosis of the return series (normal = 0)
59
+
60
+
61
def _normal_cdf(x: float) -> float:
    """Evaluate the standard normal CDF at ``x`` using ``math.erf`` (no scipy)."""
    scaled = x / math.sqrt(2.0)
    return 0.5 * (1.0 + math.erf(scaled))
64
+
65
+
66
def _annualized_sharpe(returns: np.ndarray) -> float:
    """Annualized Sharpe (risk-free = 0) using the sample std (ddof=1).

    Parameters
    ----------
    returns : np.ndarray
        Per-period returns. Callers in this module pass an array that has
        already been filtered to finite values.

    Returns
    -------
    float
        ``mean / std * sqrt(_TRADING_DAYS_PER_YEAR)``, or 0.0 when fewer than
        two observations are supplied or the series is numerically constant.
    """
    if returns.size < 2:
        return 0.0
    mean = float(returns.mean())
    std = float(returns.std(ddof=1))
    # A constant series seldom yields std == 0.0 exactly: rounding in the mean
    # leaves residuals of order eps * |value| (e.g. [0.1, 0.1, 0.1] has
    # std ~ 1e-17), and dividing by that produces an astronomically large,
    # meaningless Sharpe. Treat a dispersion that is negligible relative to
    # the magnitude of the data as zero.
    scale = float(np.abs(returns).max())
    if std <= 1e-12 * scale:
        return 0.0
    return mean / std * math.sqrt(_TRADING_DAYS_PER_YEAR)
75
+
76
+
77
def _sample_skew_kurtosis(returns: np.ndarray) -> tuple[float, float]:
    """Sample skewness and excess kurtosis from population (biased) moments.

    Excess kurtosis = K - 3, so a normal distribution has 0 excess kurtosis.

    Parameters
    ----------
    returns : np.ndarray
        Per-period returns. Callers in this module pass an array that has
        already been filtered to finite values.

    Returns
    -------
    tuple[float, float]
        ``(skew, excess_kurtosis)``. ``(0.0, 0.0)`` when fewer than four
        observations are supplied or the series is numerically constant.
    """
    n = returns.size
    if n < 4:
        return 0.0, 0.0
    centered = returns - returns.mean()
    var = float((centered * centered).mean())
    std = math.sqrt(var)
    # An exact ``var == 0.0`` test misses constant series whose mean is not
    # exactly representable: the residuals are pure rounding noise, and the
    # standardized moments computed from them are meaningless. Compare the
    # dispersion against the magnitude of the data instead.
    scale = float(np.abs(returns).max())
    if std <= 1e-12 * scale:
        return 0.0, 0.0
    skew = float((centered ** 3).mean() / (std ** 3))
    kurt_excess = float((centered ** 4).mean() / (var * var)) - 3.0
    return skew, kurt_excess
95
+
96
+
97
def compute_psr(
    daily_returns: pd.Series | np.ndarray,
    sharpe_benchmark: float = 0.0,
) -> PSRResult:
    """Probabilistic Sharpe Ratio (Bailey & López de Prado 2012).

    Parameters
    ----------
    daily_returns : array-like
        Daily simple returns. Non-finite entries are dropped.
    sharpe_benchmark : float
        Annualized Sharpe to test against. The default 0.0 asks "is the true
        SR positive?".

    Returns
    -------
    PSRResult
        ``status`` ("ok" | "insufficient_data") and ``n`` are always present.
        When status is "ok" the dict also carries ``sharpe`` (observed
        annualized SR), ``sharpe_benchmark`` (as given), ``psr`` (probability
        that the true SR exceeds the benchmark), ``skew`` and ``kurtosis``
        (excess kurtosis).

    Notes
    -----
    PSR(SR*) = Phi( (SR_hat - SR*) * sqrt(n - 1)
                    / sqrt(1 - skew * SR_hat + (kurtosis - 1)/4 * SR_hat^2) )

    SR_hat and SR* are per-period (daily) Sharpe ratios and ``kurtosis`` is
    the raw fourth standardized moment (3 for a normal). The annualized
    inputs are therefore rescaled to daily units before the formula is
    applied.
    """
    values = np.asarray(daily_returns, dtype=np.float64)
    finite = values[np.isfinite(values)]
    n = finite.size
    # PSR is an asymptotic result; below 30 observations it is not reported.
    if n < 30:
        return {"status": "insufficient_data", "n": n}

    sr_annualized = _annualized_sharpe(finite)
    # Bring both the observed and the benchmark Sharpe onto the daily scale.
    sr_daily = sr_annualized / math.sqrt(_TRADING_DAYS_PER_YEAR)
    sr_bench_daily = sharpe_benchmark / math.sqrt(_TRADING_DAYS_PER_YEAR)

    skew, kurt_excess = _sample_skew_kurtosis(finite)
    # The formula wants raw kurtosis; the helper reports excess kurtosis.
    kurt_raw = kurt_excess + 3.0

    denom_sq = 1.0 - skew * sr_daily + (kurt_raw - 1.0) / 4.0 * sr_daily ** 2
    if denom_sq <= 0.0:
        # Pathological skew/kurtosis combination: the formula breaks down, so
        # report the maximum-uncertainty value.
        psr = 0.5
    else:
        z = (sr_daily - sr_bench_daily) * math.sqrt(n - 1) / math.sqrt(denom_sq)
        psr = float(_normal_cdf(z))

    return {
        "status": "ok",
        "n": n,
        "sharpe": sr_annualized,
        "sharpe_benchmark": sharpe_benchmark,
        "psr": psr,
        "skew": skew,
        "kurtosis": kurt_excess,
    }
169
+
170
+
171
+ _EULER_MASCHERONI = 0.5772156649015329
172
+
173
+
174
def compute_dsr(
    daily_returns: pd.Series | np.ndarray,
    n_trials: int,
) -> DSRResult:
    """Deflated Sharpe Ratio.

    Corrects PSR for the selection bias of choosing the maximum Sharpe from
    ``n_trials`` candidates. The benchmark Sharpe is set to the expected
    maximum SR under the null hypothesis (true SR = 0 for all candidates),
    accounting for sample size and sample moments.

    Parameters
    ----------
    daily_returns : array-like
        Daily returns of the *winner* (the candidate selected as best).
        Non-finite entries are dropped.
    n_trials : int
        Number of candidates considered when selecting this winner. For a
        60-combo param sweep, n_trials = 60. Must be >= 1.

    Returns
    -------
    DSRResult
        ``status`` ("ok" | "insufficient_data"), ``n`` and ``n_trials`` are
        always present. When status is "ok" the dict also carries ``sharpe``,
        ``sharpe_benchmark`` (annualized expected-max Sharpe under the null),
        ``dsr``, ``skew`` and ``kurtosis`` (excess kurtosis).

    Raises
    ------
    ValueError
        If ``n_trials`` is less than 1.

    Notes
    -----
    Expected maximum under the null (Bailey & López de Prado 2014):

        E[max(SR)] ≈ V * (sqrt(2 ln N) - (gamma + ln ln N) / (2 sqrt(2 ln N)))

    where gamma is the Euler-Mascheroni constant and V, the standard
    deviation of estimated SRs across trials, is approximated by the sampling
    std of SR_hat on the winner:

        V = sqrt((1 - skew*SR + (k - 1)/4 * SR^2) / (n - 1))

    DSR = PSR(SR_hat | benchmark = E[max(SR_null)]).

    - n_trials = 1 reduces to PSR(0): no selection correction is needed.
    - For small n_trials (< 5) the expansion overstates the threshold
      slightly, which is the conservative direction (harder to clear).
    """
    if n_trials < 1:
        raise ValueError(f"n_trials must be >= 1, got {n_trials}")

    r = np.asarray(daily_returns, dtype=np.float64)
    r = r[np.isfinite(r)]
    n = r.size
    if n < 30:
        return {"status": "insufficient_data", "n": n, "n_trials": n_trials}

    if n_trials == 1:
        # No selection bias correction needed; reduce to PSR(0).
        psr_result = compute_psr(r, sharpe_benchmark=0.0)
        return {
            "status": psr_result["status"],
            "n": n,
            "sharpe": psr_result.get("sharpe", 0.0),
            "n_trials": 1,
            "sharpe_benchmark": 0.0,
            "dsr": psr_result.get("psr", 0.5),
            "skew": psr_result.get("skew", 0.0),
            "kurtosis": psr_result.get("kurtosis", 0.0),
        }

    sr_annualized = _annualized_sharpe(r)
    sr_daily = sr_annualized / math.sqrt(_TRADING_DAYS_PER_YEAR)
    skew, kurt_excess = _sample_skew_kurtosis(r)
    # The variance formula uses raw kurtosis; the helper reports excess.
    kurt_raw = kurt_excess + 3.0

    # Sampling variance of SR_hat, in daily SR units.
    var_sr_sq = (1.0 - skew * sr_daily + (kurt_raw - 1.0) / 4.0 * sr_daily ** 2) / (n - 1)
    if var_sr_sq <= 0.0:
        # Pathological skew/kurtosis combination: report maximum uncertainty.
        return {
            "status": "ok",
            "n": n,
            "sharpe": sr_annualized,
            "n_trials": n_trials,
            "sharpe_benchmark": 0.0,
            "dsr": 0.5,
            "skew": skew,
            "kurtosis": kurt_excess,
        }
    v = math.sqrt(var_sr_sq)

    # Expected max SR under the null, in daily SR units. The n_trials < 1 and
    # n_trials == 1 cases were handled above, so n_trials > 1 here: ln(N) is
    # strictly positive and ln(ln(N)) is always defined.
    ln_n = math.log(n_trials)
    sqrt_2_ln_n = math.sqrt(2.0 * ln_n)
    ln_ln_n = math.log(ln_n)
    expected_max_sr_daily = v * (sqrt_2_ln_n - (_EULER_MASCHERONI + ln_ln_n) / (2.0 * sqrt_2_ln_n))
    expected_max_sr_annualized = expected_max_sr_daily * math.sqrt(_TRADING_DAYS_PER_YEAR)

    psr_result = compute_psr(r, sharpe_benchmark=expected_max_sr_annualized)

    return {
        "status": psr_result["status"],
        "n": n,
        "sharpe": sr_annualized,
        "n_trials": n_trials,
        "sharpe_benchmark": expected_max_sr_annualized,
        "dsr": psr_result.get("psr", 0.5),
        "skew": skew,
        "kurtosis": kurt_excess,
    }
@@ -0,0 +1,161 @@
1
+ """expectancy — hit rate × win/loss ratio decomposition.
2
+
3
+ The single most diagnostic breakdown for distinguishing skilled vs unskilled
4
+ risk-taking:
5
+
6
+ - **Selection skill**: high hit rate with symmetric W/L ratio (~1.0) — picks
7
+ winners more often than losers, magnitudes about equal.
8
+ - **Convexity skill**: moderate hit rate with W/L ratio > 1.5 — rides winners
9
+ and cuts losers; total expectancy positive even with <50% hit rate.
10
+ - **No skill (just YOLO into vol)**: declining hit rate with no compensating
11
+ W/L improvement → expectancy ≤ 0.
12
+
13
+ Formula:
14
+ expectancy = hit_rate * avg_win - (1 - hit_rate) * avg_loss
15
+
16
+ where avg_win is the mean of positive returns (or alpha) and avg_loss is the
17
+ mean *magnitude* of negative returns. Reports both expectancy and the
18
+ decomposition components so consumers can see WHICH dimension is failing.
19
+
20
+ Pure-compute. Operates on a returns array; no I/O.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import logging
26
+ from typing import TypedDict
27
+
28
+ import numpy as np
29
+ import pandas as pd
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
class ExpectancyResult(TypedDict, total=False):
    """Mapping returned by ``compute_expectancy``.

    Declared with ``total=False`` because the ``insufficient_data`` result
    carries only ``status`` and ``n``.
    """

    status: str  # "ok", "insufficient_data", "no_wins" or "no_losses"
    n: int  # number of finite observations used
    hit_rate: float  # fraction of observations above the threshold
    avg_win: float  # mean excess over the threshold among wins
    avg_loss: float  # magnitude (positive number)
    win_loss_ratio: float  # avg_win / avg_loss
    expectancy: float  # hit_rate * avg_win - (1 - hit_rate) * avg_loss
    expectancy_per_unit_loss: float  # expectancy / avg_loss; the "R-multiple" expectancy
43
+
44
+
45
def compute_expectancy(
    returns: pd.Series | np.ndarray,
    threshold: float = 0.0,
    min_samples: int = 10,
) -> ExpectancyResult:
    """Compute the expectancy decomposition over a return series.

    Parameters
    ----------
    returns : pd.Series or np.ndarray
        Per-trade or per-pick returns (or alphas). Non-finite entries are
        dropped.
    threshold : float
        Win/loss boundary. Default 0 → wins are positive returns. Set
        non-zero to measure relative to a benchmark return; all averages are
        then computed on ``returns - threshold``.
    min_samples : int
        Minimum number of finite samples required. Below this floor (or when
        no finite samples remain at all) the result is
        ``status="insufficient_data"``. Default 10.

    Returns
    -------
    ExpectancyResult
        status: "ok" | "insufficient_data" | "no_wins" | "no_losses"
        n: sample size
        hit_rate: fraction of samples with return > threshold
        avg_win: mean excess of the wins (None if there are no wins)
        avg_loss: mean magnitude of the excess of samples with
            return <= threshold (0.0 if there are none)
        win_loss_ratio: avg_win / avg_loss (None if either is missing/zero)
        expectancy: hit_rate * avg_win - (1 - hit_rate) * avg_loss
        expectancy_per_unit_loss: expectancy / avg_loss (R-multiple form;
            None when avg_loss is zero or there are no wins)

    Notes
    -----
    The R-multiple form (expectancy / avg_loss) is the "expectancy per unit
    of risk taken" — useful for comparing across regimes where absolute
    return levels shift.
    """
    arr = np.asarray(returns, dtype=np.float64)
    arr = arr[np.isfinite(arr)]
    n = arr.size
    # ``n == 0`` is checked explicitly: with ``min_samples <= 0`` an empty
    # input would otherwise reach the hit-rate division below and raise
    # ZeroDivisionError.
    if n == 0 or n < min_samples:
        return {"status": "insufficient_data", "n": n}

    excess = arr - threshold
    wins = excess[excess > 0]
    losses = excess[excess <= 0]

    hit_rate = float(wins.size) / n

    if wins.size == 0:
        return {
            "status": "no_wins",
            "n": n,
            "hit_rate": 0.0,
            "avg_win": None,  # type: ignore[typeddict-item]
            # losses.mean() is <= 0, so abs() equals the negation but never
            # yields a negative zero.
            "avg_loss": abs(float(losses.mean())) if losses.size else 0.0,
            "win_loss_ratio": None,  # type: ignore[typeddict-item]
            "expectancy": float(excess.mean()),
            "expectancy_per_unit_loss": None,  # type: ignore[typeddict-item]
        }

    if losses.size == 0:
        return {
            "status": "no_losses",
            "n": n,
            "hit_rate": 1.0,
            "avg_win": float(wins.mean()),
            "avg_loss": 0.0,
            "win_loss_ratio": None,  # type: ignore[typeddict-item]
            "expectancy": float(excess.mean()),
            "expectancy_per_unit_loss": None,  # type: ignore[typeddict-item]
        }

    avg_win = float(wins.mean())
    # Report the loss as a positive magnitude. abs() (rather than unary minus)
    # keeps the value at +0.0 when every loss sits exactly on the threshold.
    avg_loss = abs(float(losses.mean()))
    win_loss_ratio = avg_win / avg_loss if avg_loss > 0 else None
    expectancy = hit_rate * avg_win - (1.0 - hit_rate) * avg_loss
    epul = expectancy / avg_loss if avg_loss > 0 else None

    return {
        "status": "ok",
        "n": n,
        "hit_rate": hit_rate,
        "avg_win": avg_win,
        "avg_loss": avg_loss,
        "win_loss_ratio": win_loss_ratio,  # type: ignore[typeddict-item]
        "expectancy": expectancy,
        "expectancy_per_unit_loss": epul,  # type: ignore[typeddict-item]
    }
135
+
136
+
137
def compute_expectancy_by_group(
    df: pd.DataFrame,
    return_col: str,
    group_col: str,
    threshold: float = 0.0,
    min_samples: int = 10,
) -> dict[str, ExpectancyResult]:
    """Run ``compute_expectancy`` separately for each value of ``group_col``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding both the return column and the grouping column.
    return_col : str
        Name of the column with the returns to evaluate.
    group_col : str
        Name of the column to stratify by (team_id, conviction, sector, ...).
    threshold, min_samples
        Forwarded unchanged to ``compute_expectancy`` for every group.

    Returns
    -------
    dict[str, ExpectancyResult]
        One entry per group, keyed by ``str(group_value)``. Groups with fewer
        than ``min_samples`` usable rows get a status=insufficient_data entry.

    Raises
    ------
    KeyError
        If ``return_col`` or ``group_col`` is not a column of ``df``.
    """
    columns = df.columns
    if return_col not in columns:
        raise KeyError(f"return_col {return_col!r} not in dataframe")
    if group_col not in columns:
        raise KeyError(f"group_col {group_col!r} not in dataframe")
    return {
        str(key): compute_expectancy(
            frame[return_col], threshold=threshold, min_samples=min_samples,
        )
        for key, frame in df.groupby(group_col)
    }
@@ -0,0 +1,149 @@
1
+ """information_coefficient — Spearman rank correlation between a conviction/
2
+ score and forward returns.
3
+
4
+ The risk-invariant quality metric. IC of 0.05 on conservative picks vs 0.02 on
5
+ aggressive picks tells you the signal is *worse* at the harder task even if
6
+ absolute returns went up.
7
+
8
+ Why Spearman over Pearson: rank correlation is invariant to the scale and
9
+ distribution of the conviction scores — they only need to *order* picks
10
+ correctly; absolute calibration is a separate concern.
11
+
12
+ Pure-compute. Operates on parallel arrays of (conviction, return); no I/O.
13
+ scipy.stats.spearmanr is used when available (for the p-value); otherwise a
14
+ numpy rank-then-Pearson fallback gives the identical IC with no p-value.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import logging
20
+ from typing import TypedDict
21
+
22
+ import numpy as np
23
+ import pandas as pd
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
class ICResult(TypedDict, total=False):
    """Result record produced by the information-coefficient helpers.

    Declared with ``total=False`` because not every key is always present:
    the ``insufficient_data`` result returned by ``compute_ic`` carries only
    ``status`` and ``n``.
    """

    status: str  # "ok" | "insufficient_data" | "no_variance"
    n: int  # number of (conviction, return) pairs left after filtering
    ic: float  # Spearman rank correlation
    p_value: float  # two-sided p-value vs null hypothesis IC = 0
    n_buckets: int  # number of distinct conviction levels
34
+
35
+
36
def compute_ic(
    conviction: pd.Series | np.ndarray,
    forward_return: pd.Series | np.ndarray,
    min_samples: int = 20,
) -> ICResult:
    """Compute Spearman rank correlation between conviction and forward return.

    Parameters
    ----------
    conviction : array-like
        Stated conviction or composite score per pick. Higher = more
        confident. Must be numeric (string ranks need to be encoded
        upstream, since the values are converted to float64).
    forward_return : array-like
        Realized forward return per pick over the prediction horizon.
        Same number of elements as ``conviction``. A pair is dropped when
        either value is non-finite (NaN or +/-inf).
    min_samples : int
        Minimum valid pairs required. Below this floor returns
        status=insufficient_data. Default 20 (Spearman p-values are
        unreliable on small samples).

    Returns
    -------
    ICResult dict with:
        status:    "ok" | "insufficient_data" | "no_variance"
        n:         number of (conviction, return) pairs after filtering
        ic:        Spearman rank correlation in [-1, 1]
        p_value:   two-sided p-value (NaN when scipy could not be used)
        n_buckets: number of distinct conviction levels (low = collapsed
                   conviction signal)

        ``insufficient_data`` results carry only ``status`` and ``n``;
        ``no_variance`` results report ic=0.0 and p_value=1.0.

    Raises
    ------
    ValueError
        If the two inputs do not hold the same number of elements.

    Notes
    -----
    - Inputs are flattened to 1-D, so an ``(n, 1)`` column vector pairs
      element-wise with an ``(n,)`` array.
    - scipy.stats.spearmanr is used when importable. If scipy is missing,
      or the call fails unexpectedly, the IC is computed as the Pearson
      correlation of the average ranks and ``p_value`` is NaN. The
      unexpected-failure case is logged rather than silently swallowed.
    """
    c = np.asarray(conviction, dtype=np.float64).ravel()
    r = np.asarray(forward_return, dtype=np.float64).ravel()
    if c.size != r.size:
        raise ValueError(
            f"conviction (n={c.size}) and forward_return (n={r.size}) "
            "must be same length"
        )

    valid = np.isfinite(c) & np.isfinite(r)
    c = c[valid]
    r = r[valid]
    n = int(c.size)
    if n < min_samples:
        return {"status": "insufficient_data", "n": n}

    n_buckets = int(np.unique(c).size)
    # For finite data, n_buckets < 2 is exactly "conviction is constant (or
    # empty)". The return side uses min == max as the exact constancy test
    # (avoids float64 std residual); it is only evaluated when n >= 1.
    if n_buckets < 2 or r.min() == r.max():
        return {
            "status": "no_variance",
            "n": n,
            "n_buckets": n_buckets,
            "ic": 0.0,
            "p_value": 1.0,
        }

    log = logging.getLogger(__name__)
    ic: float | None = None
    p_value = float("nan")

    # scipy is an optional dependency: only a failed import is the expected
    # trigger for the fallback.
    try:
        from scipy.stats import spearmanr  # type: ignore[import-not-found]
    except ImportError:
        spearmanr = None
        log.debug("scipy not importable; using rank-then-Pearson IC (no p-value)")

    if spearmanr is not None:
        try:
            # Tuple unpacking works for both the legacy namedtuple result
            # and the newer result object, unlike the `.statistic` attribute.
            rho, pval = spearmanr(c, r)
            ic, p_value = float(rho), float(pval)
        except Exception:
            # Stay best-effort (callers never saw scipy errors before), but
            # make the degradation visible instead of hiding it.
            log.warning(
                "scipy.stats.spearmanr failed; falling back to "
                "rank-then-Pearson IC (no p-value)",
                exc_info=True,
            )
            ic = None
            p_value = float("nan")

    if ic is None:
        # Fallback: Pearson correlation of average ranks == Spearman IC.
        c_rank = pd.Series(c).rank().to_numpy()
        r_rank = pd.Series(r).rank().to_numpy()
        ic = float(np.corrcoef(c_rank, r_rank)[0, 1])

    return {
        "status": "ok",
        "n": n,
        "n_buckets": n_buckets,
        "ic": ic,
        "p_value": p_value,
    }
125
+
126
+
127
def compute_ic_by_bucket(
    df: pd.DataFrame,
    conviction_col: str,
    return_col: str,
    bucket_col: str,
    min_samples: int = 20,
) -> dict[str, ICResult]:
    """Compute the IC separately inside each bucket (sector, tier, regime, ...).

    This is the "is the signal good at the harder task" cut: slice by
    conviction decile or sector and measure IC within every slice. A signal
    whose IC falls on its highest-conviction picks is failing exactly where
    it claims to be most confident.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the three named columns.
    conviction_col, return_col : str
        Columns passed to ``compute_ic`` as conviction and forward return.
    bucket_col : str
        Column the frame is grouped on.
    min_samples : int
        Forwarded unchanged to ``compute_ic`` for every bucket.

    Returns
    -------
    dict[str, ICResult]
        One ``compute_ic`` result per bucket, keyed by ``str(bucket_value)``.

    Raises
    ------
    KeyError
        If any of the three columns is missing from ``df``; columns are
        checked in the order conviction, return, bucket.
    """
    available = df.columns
    for col in (conviction_col, return_col, bucket_col):
        if col not in available:
            raise KeyError(f"column {col!r} not in dataframe")

    return {
        str(label): compute_ic(
            frame[conviction_col], frame[return_col], min_samples=min_samples,
        )
        for label, frame in df.groupby(bucket_col)
    }