rangebar 11.6.1__cp313-cp313-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rangebar/CLAUDE.md +327 -0
- rangebar/__init__.py +227 -0
- rangebar/__init__.pyi +1089 -0
- rangebar/_core.cpython-313-darwin.so +0 -0
- rangebar/checkpoint.py +472 -0
- rangebar/cli.py +298 -0
- rangebar/clickhouse/CLAUDE.md +139 -0
- rangebar/clickhouse/__init__.py +100 -0
- rangebar/clickhouse/bulk_operations.py +309 -0
- rangebar/clickhouse/cache.py +734 -0
- rangebar/clickhouse/client.py +121 -0
- rangebar/clickhouse/config.py +141 -0
- rangebar/clickhouse/mixin.py +120 -0
- rangebar/clickhouse/preflight.py +504 -0
- rangebar/clickhouse/query_operations.py +345 -0
- rangebar/clickhouse/schema.sql +187 -0
- rangebar/clickhouse/tunnel.py +222 -0
- rangebar/constants.py +288 -0
- rangebar/conversion.py +177 -0
- rangebar/exceptions.py +207 -0
- rangebar/exness.py +364 -0
- rangebar/hooks.py +311 -0
- rangebar/logging.py +171 -0
- rangebar/notify/__init__.py +15 -0
- rangebar/notify/pushover.py +155 -0
- rangebar/notify/telegram.py +271 -0
- rangebar/orchestration/__init__.py +20 -0
- rangebar/orchestration/count_bounded.py +797 -0
- rangebar/orchestration/helpers.py +412 -0
- rangebar/orchestration/models.py +76 -0
- rangebar/orchestration/precompute.py +498 -0
- rangebar/orchestration/range_bars.py +736 -0
- rangebar/orchestration/tick_fetcher.py +226 -0
- rangebar/ouroboros.py +454 -0
- rangebar/processors/__init__.py +22 -0
- rangebar/processors/api.py +383 -0
- rangebar/processors/core.py +522 -0
- rangebar/resource_guard.py +567 -0
- rangebar/storage/__init__.py +22 -0
- rangebar/storage/checksum_registry.py +218 -0
- rangebar/storage/parquet.py +728 -0
- rangebar/streaming.py +300 -0
- rangebar/validation/__init__.py +69 -0
- rangebar/validation/cache_staleness.py +277 -0
- rangebar/validation/continuity.py +664 -0
- rangebar/validation/gap_classification.py +294 -0
- rangebar/validation/post_storage.py +317 -0
- rangebar/validation/tier1.py +175 -0
- rangebar/validation/tier2.py +261 -0
- rangebar-11.6.1.dist-info/METADATA +308 -0
- rangebar-11.6.1.dist-info/RECORD +54 -0
- rangebar-11.6.1.dist-info/WHEEL +4 -0
- rangebar-11.6.1.dist-info/entry_points.txt +2 -0
- rangebar-11.6.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"""Tier 1: Auto-validation suite for microstructure features (<30 sec).
|
|
2
|
+
|
|
3
|
+
Run on every precompute to catch data quality issues early.
|
|
4
|
+
This is the smoke test - fast checks that should always pass.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import TYPE_CHECKING
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
import pandas as pd
|
|
15
|
+
|
|
16
|
+
# Microstructure feature columns (Issue #25)
|
|
17
|
+
FEATURE_COLS = [
|
|
18
|
+
"duration_us",
|
|
19
|
+
"ofi",
|
|
20
|
+
"vwap_close_deviation",
|
|
21
|
+
"price_impact",
|
|
22
|
+
"kyle_lambda_proxy",
|
|
23
|
+
"trade_intensity",
|
|
24
|
+
"volume_per_trade",
|
|
25
|
+
"aggression_ratio",
|
|
26
|
+
"aggregation_density",
|
|
27
|
+
"turnover_imbalance",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
# Validation thresholds
|
|
31
|
+
MIN_SAMPLES_FOR_CORRELATION = 50
|
|
32
|
+
OFI_CORR_MIN = 0.05
|
|
33
|
+
OFI_CORR_MAX = 0.8
|
|
34
|
+
OFI_MEAN_THRESHOLD = 0.3
|
|
35
|
+
AGGRESSION_RATIO_MAX_MEDIAN = 10
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _check_bounds(df: pd.DataFrame, results: dict) -> None:
|
|
39
|
+
"""Check feature bounds and populate results dict."""
|
|
40
|
+
# Duration should be non-negative
|
|
41
|
+
if "duration_us" in df.columns:
|
|
42
|
+
results["duration_positive"] = (df["duration_us"] >= 0).all()
|
|
43
|
+
else:
|
|
44
|
+
results["duration_positive"] = True # N/A
|
|
45
|
+
|
|
46
|
+
# OFI should be bounded [-1, 1]
|
|
47
|
+
if "ofi" in df.columns:
|
|
48
|
+
results["ofi_bounded"] = df["ofi"].between(-1, 1).all()
|
|
49
|
+
else:
|
|
50
|
+
results["ofi_bounded"] = True # N/A
|
|
51
|
+
|
|
52
|
+
# Turnover imbalance should be bounded [-1, 1]
|
|
53
|
+
if "turnover_imbalance" in df.columns:
|
|
54
|
+
results["turnover_imbalance_bounded"] = (
|
|
55
|
+
df["turnover_imbalance"].between(-1, 1).all()
|
|
56
|
+
)
|
|
57
|
+
else:
|
|
58
|
+
results["turnover_imbalance_bounded"] = True # N/A
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _check_correlation(df: pd.DataFrame, results: dict) -> None:
    """Sanity-check the OFI vs. bar-return correlation.

    Writes ``ofi_return_corr`` (float, or None when not computable) and
    ``ofi_corr_sane`` (bool, or None when not computable) into ``results``.
    Requires both Close and ofi columns plus enough samples.
    """
    computable = (
        "Close" in df.columns
        and "ofi" in df.columns
        and len(df) > MIN_SAMPLES_FOR_CORRELATION
    )
    if not computable:
        results["ofi_return_corr"] = None
        results["ofi_corr_sane"] = None
        return

    corr = df["ofi"].corr(df["Close"].pct_change())
    results["ofi_return_corr"] = None if np.isnan(corr) else float(corr)

    if results["ofi_return_corr"] is None:
        # Correlation was NaN (e.g. constant series) — nothing to judge.
        results["ofi_corr_sane"] = None
    else:
        # OFI should relate to returns, but not suspiciously strongly.
        magnitude = abs(results["ofi_return_corr"])
        results["ofi_corr_sane"] = OFI_CORR_MIN < magnitude < OFI_CORR_MAX
|
|
84
|
+
def _check_distributions(df: pd.DataFrame, results: dict) -> None:
    """Check coarse distribution properties of selected features.

    Writes ``ofi_mean_near_zero`` and ``aggression_ratio_reasonable`` into
    ``results``; a missing column counts as a pass (not applicable).
    """
    # A roughly balanced market keeps mean OFI close to zero.
    results["ofi_mean_near_zero"] = (
        abs(df["ofi"].mean()) < OFI_MEAN_THRESHOLD
        if "ofi" in df.columns
        else True  # N/A
    )

    # An extreme median aggression ratio points at a data problem.
    results["aggression_ratio_reasonable"] = (
        df["aggression_ratio"].median() < AGGRESSION_RATIO_MAX_MEDIAN
        if "aggression_ratio" in df.columns
        else True  # N/A
    )
|
|
99
|
+
def validate_tier1(df: pd.DataFrame) -> dict:
    """Auto-validation suite (<30 sec). Run on every precompute.

    Fast data-quality checks over whichever microstructure feature columns
    (see ``FEATURE_COLS``) are present: NaN/Inf detection, range bounds,
    OFI/return correlation sanity and coarse distribution checks.

    Parameters
    ----------
    df : pd.DataFrame
        Range bar DataFrame with microstructure columns

    Returns
    -------
    dict
        Per-check results plus an overall verdict. Keys:

        - features_present: bool - any feature columns found at all
        - features_found: list[str] - the columns that were validated
        - no_nan: bool - no NaN values in feature columns
        - no_inf: bool - no Inf values in numeric feature columns
        - duration_positive: bool - all durations >= 0
        - ofi_bounded: bool - OFI in [-1, 1]
        - turnover_imbalance_bounded: bool - turnover imbalance in [-1, 1]
        - ofi_return_corr: float | None - OFI-return correlation if computable
        - ofi_corr_sane: bool | None - correlation within the expected band
        - ofi_mean_near_zero: bool - OFI mean close to 0 (balanced market)
        - aggression_ratio_reasonable: bool - median aggression ratio < 10
        - tier1_passed: bool - all critical checks passed
        - error: str - only set when no feature columns were found

    Examples
    --------
    >>> from rangebar import get_range_bars
    >>> from rangebar.validation.tier1 import validate_tier1
    >>> df = get_range_bars("BTCUSDT", "2024-01-01", "2024-01-02")
    >>> result = validate_tier1(df)
    >>> print("Tier 1:", "PASSED" if result["tier1_passed"] else "FAILED")
    """
    results: dict = {}

    # Validate only the feature columns this DataFrame actually carries.
    present_cols = [c for c in FEATURE_COLS if c in df.columns]
    if not present_cols:
        results["features_present"] = False
        results["tier1_passed"] = False
        results["error"] = "No microstructure feature columns found"
        return results

    results["features_present"] = True
    results["features_found"] = present_cols

    # 1. NaN/Inf checks (Inf only meaningful on numeric dtypes).
    features = df[present_cols]
    results["no_nan"] = not features.isna().any().any()
    results["no_inf"] = (
        not np.isinf(features.select_dtypes(include=[np.number])).any().any()
    )

    # 2. Range bounds.
    _check_bounds(df, results)

    # 3. OFI/return correlation sanity.
    _check_correlation(df, results)

    # 4. Distribution checks.
    _check_distributions(df, results)

    # 5. Overall verdict: every critical check must hold. The correlation
    # checks are informational and deliberately excluded.
    results["tier1_passed"] = all(
        results[key]
        for key in (
            "no_nan",
            "no_inf",
            "duration_positive",
            "ofi_bounded",
            "turnover_imbalance_bounded",
        )
    )

    return results
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
"""Tier 2: Statistical validation before production ML (~10 min).
|
|
2
|
+
|
|
3
|
+
MANDATORY before ML training. Validates:
|
|
4
|
+
- Stationarity (ADF test)
|
|
5
|
+
- Predictive power (Spearman with forward returns)
|
|
6
|
+
- Mutual information with target
|
|
7
|
+
- Feature correlation matrix (redundancy check)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
from typing import TYPE_CHECKING
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
|
|
17
|
+
from .tier1 import FEATURE_COLS, validate_tier1
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
import pandas as pd
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
# Validation thresholds
|
|
25
|
+
MIN_SAMPLES_FOR_ADF = 20
|
|
26
|
+
MIN_SAMPLES_FOR_SPEARMAN = 100
|
|
27
|
+
MIN_SAMPLES_FOR_MI = 50
|
|
28
|
+
ADF_SIGNIFICANCE_LEVEL = 0.05
|
|
29
|
+
SPEARMAN_MIN_CORRELATION = 0.02
|
|
30
|
+
SPEARMAN_SIGNIFICANCE_LEVEL = 0.05
|
|
31
|
+
HIGH_CORRELATION_THRESHOLD = 0.8
|
|
32
|
+
MIN_SIGNIFICANT_FEATURES = 3
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _run_stationarity_tests(
    data: pd.DataFrame,
    present_cols: list[str],
) -> dict:
    """Run Augmented Dickey-Fuller tests on each feature column.

    Returns a mapping ``column -> {"adf_statistic", "adf_p_value",
    "is_stationary"}``; a column whose test raised maps to
    ``{"error": ...}``. Columns with too few non-NaN samples are skipped.
    When statsmodels is not installed the whole result is an error dict.
    """
    try:
        from statsmodels.tsa.stattools import adfuller
    except ImportError:
        logger.warning("statsmodels not installed, skipping stationarity tests")
        return {"error": "statsmodels not installed"}

    stationarity: dict = {}
    for col in present_cols:
        series = data[col].dropna()
        if len(series) < MIN_SAMPLES_FOR_ADF:
            continue  # too short for a meaningful ADF test
        try:
            adf_stat, p_value = adfuller(series, maxlag=5)[:2]
        except Exception as e:
            logger.warning("ADF test failed for %s: %s", col, e)
            stationarity[col] = {"error": str(e)}
        else:
            stationarity[col] = {
                "adf_statistic": float(adf_stat),
                "adf_p_value": float(p_value),
                # Low p-value rejects the unit-root null -> stationary.
                "is_stationary": p_value < ADF_SIGNIFICANCE_LEVEL,
            }

    return stationarity
|
|
65
|
+
def _run_predictive_power_tests(
    data: pd.DataFrame,
    present_cols: list[str],
    target_col: str,
) -> dict:
    """Measure each feature's Spearman correlation with the target.

    Returns a mapping ``column -> {"spearman_rho", "p_value",
    "significant"}`` (error dict per column on failure). Empty when the
    target column is missing; a single error dict when scipy is missing.
    Columns without enough aligned samples are skipped.
    """
    if target_col not in data.columns:
        return {}

    try:
        from scipy.stats import spearmanr
    except ImportError:
        logger.warning("scipy not installed, skipping Spearman tests")
        return {"error": "scipy not installed"}

    predictive: dict = {}
    for col in present_cols:
        aligned = data[[col, target_col]].dropna()
        if len(aligned) <= MIN_SAMPLES_FOR_SPEARMAN:
            continue  # not enough joint samples for a stable estimate
        try:
            rho, p = spearmanr(aligned[col], aligned[target_col])
            rho_valid = not np.isnan(rho)
            predictive[col] = {
                "spearman_rho": float(rho) if rho_valid else None,
                "p_value": float(p) if not np.isnan(p) else None,
                # Significant = non-trivial magnitude AND statistically sound.
                "significant": (
                    abs(rho) > SPEARMAN_MIN_CORRELATION
                    and p < SPEARMAN_SIGNIFICANCE_LEVEL
                    if rho_valid
                    else False
                ),
            }
        except Exception as e:
            logger.warning("Spearman test failed for %s: %s", col, e)
            predictive[col] = {"error": str(e)}

    return predictive
|
|
104
|
+
def _run_mutual_information(
    data: pd.DataFrame,
    present_cols: list[str],
    target_col: str,
) -> dict:
    """Estimate mutual information between each feature and the target.

    Returns ``column -> MI score`` on success, ``{}`` when the target
    column is missing or too little aligned data exists, and
    ``{"error": ...}`` when sklearn is missing or the estimator raises.
    """
    if target_col not in data.columns:
        return {}

    try:
        from sklearn.feature_selection import mutual_info_regression

        features = data[present_cols].dropna()
        target = data.loc[features.index, target_col].dropna()
        shared_idx = features.index.intersection(target.index)

        # NOTE(review): this gate reuses the Spearman sample threshold while
        # the one below uses the MI threshold — presumably intentional, but
        # worth confirming.
        if len(shared_idx) <= MIN_SAMPLES_FOR_SPEARMAN:
            return {}

        features = features.loc[shared_idx]
        target = target.loc[shared_idx]

        # Drop any rows still carrying NaN after index alignment.
        keep = ~(features.isna().any(axis=1) | target.isna())
        features = features[keep]
        target = target[keep]

        if len(features) <= MIN_SAMPLES_FOR_MI:
            return {}

        # Fixed seed keeps the kNN-based estimator deterministic.
        scores = mutual_info_regression(features, target, random_state=42)
        return dict(zip(present_cols, (float(s) for s in scores), strict=False))
    except ImportError:
        logger.warning("sklearn not installed, skipping mutual information")
        return {"error": "sklearn not installed"}
    except Exception as e:
        logger.warning("Mutual information failed: %s", e)
        return {"error": str(e)}
|
|
146
|
+
def _find_high_correlation_pairs(
    data: pd.DataFrame,
    present_cols: list[str],
) -> list:
    """Return ``(col_a, col_b, rho)`` tuples for strongly correlated pairs.

    Uses Spearman correlation over the feature columns; failures are
    logged and yield an empty list rather than raising.
    """
    pairs: list = []
    try:
        corr = data[present_cols].corr(method="spearman")
        # Walk the upper triangle only — each unordered pair once.
        for i, col_a in enumerate(present_cols):
            for col_b in present_cols[i + 1 :]:
                if col_a not in corr.columns or col_b not in corr.columns:
                    continue
                rho = corr.loc[col_a, col_b]
                if np.isnan(rho) or abs(rho) <= HIGH_CORRELATION_THRESHOLD:
                    continue
                pairs.append((col_a, col_b, float(rho)))
    except Exception as e:
        logger.warning("Correlation matrix failed: %s", e)

    return pairs
|
|
169
|
+
def validate_tier2(
    df: pd.DataFrame,
    target_col: str = "forward_return",
) -> dict:
    """Statistical validation (~10 min). MANDATORY before ML training.

    Runs Tier 1 first and short-circuits on failure, then adds:
    stationarity (Augmented Dickey-Fuller), predictive power (Spearman vs.
    the target), mutual information, and a feature-redundancy scan over the
    Spearman correlation matrix.

    Parameters
    ----------
    df : pd.DataFrame
        Range bar DataFrame with microstructure columns.
        Should include forward_return column or Close for computing returns.
    target_col : str, default="forward_return"
        Column name for the prediction target.
        If not present, will be computed from Close if available.

    Returns
    -------
    dict
        All Tier 1 keys plus:

        - stationarity: dict[str, dict] - ADF results per feature
        - predictive_power: dict[str, dict] - Spearman correlation results
        - mutual_info: dict[str, float] - MI scores (if sklearn available)
        - high_correlation_pairs: list[tuple] - highly correlated feature pairs
        - significant_feature_count: int - features with significant power
        - tier2_passed: bool - Tier 2 validation passed

    Examples
    --------
    >>> from rangebar import get_range_bars
    >>> from rangebar.validation.tier2 import validate_tier2
    >>> df = get_range_bars("BTCUSDT", "2024-01-01", "2024-01-07")
    >>> df["forward_return"] = df["Close"].shift(-1) / df["Close"] - 1
    >>> result = validate_tier2(df)
    >>> print("Tier 2:", "PASSED" if result["tier2_passed"] else "FAILED")
    """
    # Tier 2 builds on Tier 1: fail fast when the basics are broken.
    results = validate_tier1(df)
    if not results.get("tier1_passed", False):
        results["tier2_passed"] = False
        return results

    present_cols = [c for c in FEATURE_COLS if c in df.columns]
    if not present_cols:
        results["tier2_passed"] = False
        return results

    # Derive the forward return from Close when the target is absent;
    # work on a copy so the caller's frame stays untouched.
    working = df
    if target_col not in df.columns and "Close" in df.columns:
        working = df.copy()
        working[target_col] = working["Close"].shift(-1) / working["Close"] - 1

    # 1. Stationarity (ADF per feature).
    results["stationarity"] = _run_stationarity_tests(working, present_cols)

    # 2. Predictive power (Spearman vs. forward returns).
    predictive = _run_predictive_power_tests(working, present_cols, target_col)
    results["predictive_power"] = predictive

    # 3. Mutual information with the target.
    results["mutual_info"] = _run_mutual_information(
        working, present_cols, target_col
    )

    # 4. Redundancy scan over the feature correlation matrix.
    results["high_correlation_pairs"] = _find_high_correlation_pairs(
        working, present_cols
    )

    # 5. Verdict: enough features must show significant predictive power.
    significant = sum(
        1
        for entry in predictive.values()
        if isinstance(entry, dict) and entry.get("significant", False)
    )
    results["significant_feature_count"] = significant

    results["tier2_passed"] = (
        results.get("tier1_passed", False)
        and significant >= MIN_SIGNIFICANT_FEATURES
    )

    return results