rangebar 11.6.1__cp313-cp313-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. rangebar/CLAUDE.md +327 -0
  2. rangebar/__init__.py +227 -0
  3. rangebar/__init__.pyi +1089 -0
  4. rangebar/_core.cpython-313-darwin.so +0 -0
  5. rangebar/checkpoint.py +472 -0
  6. rangebar/cli.py +298 -0
  7. rangebar/clickhouse/CLAUDE.md +139 -0
  8. rangebar/clickhouse/__init__.py +100 -0
  9. rangebar/clickhouse/bulk_operations.py +309 -0
  10. rangebar/clickhouse/cache.py +734 -0
  11. rangebar/clickhouse/client.py +121 -0
  12. rangebar/clickhouse/config.py +141 -0
  13. rangebar/clickhouse/mixin.py +120 -0
  14. rangebar/clickhouse/preflight.py +504 -0
  15. rangebar/clickhouse/query_operations.py +345 -0
  16. rangebar/clickhouse/schema.sql +187 -0
  17. rangebar/clickhouse/tunnel.py +222 -0
  18. rangebar/constants.py +288 -0
  19. rangebar/conversion.py +177 -0
  20. rangebar/exceptions.py +207 -0
  21. rangebar/exness.py +364 -0
  22. rangebar/hooks.py +311 -0
  23. rangebar/logging.py +171 -0
  24. rangebar/notify/__init__.py +15 -0
  25. rangebar/notify/pushover.py +155 -0
  26. rangebar/notify/telegram.py +271 -0
  27. rangebar/orchestration/__init__.py +20 -0
  28. rangebar/orchestration/count_bounded.py +797 -0
  29. rangebar/orchestration/helpers.py +412 -0
  30. rangebar/orchestration/models.py +76 -0
  31. rangebar/orchestration/precompute.py +498 -0
  32. rangebar/orchestration/range_bars.py +736 -0
  33. rangebar/orchestration/tick_fetcher.py +226 -0
  34. rangebar/ouroboros.py +454 -0
  35. rangebar/processors/__init__.py +22 -0
  36. rangebar/processors/api.py +383 -0
  37. rangebar/processors/core.py +522 -0
  38. rangebar/resource_guard.py +567 -0
  39. rangebar/storage/__init__.py +22 -0
  40. rangebar/storage/checksum_registry.py +218 -0
  41. rangebar/storage/parquet.py +728 -0
  42. rangebar/streaming.py +300 -0
  43. rangebar/validation/__init__.py +69 -0
  44. rangebar/validation/cache_staleness.py +277 -0
  45. rangebar/validation/continuity.py +664 -0
  46. rangebar/validation/gap_classification.py +294 -0
  47. rangebar/validation/post_storage.py +317 -0
  48. rangebar/validation/tier1.py +175 -0
  49. rangebar/validation/tier2.py +261 -0
  50. rangebar-11.6.1.dist-info/METADATA +308 -0
  51. rangebar-11.6.1.dist-info/RECORD +54 -0
  52. rangebar-11.6.1.dist-info/WHEEL +4 -0
  53. rangebar-11.6.1.dist-info/entry_points.txt +2 -0
  54. rangebar-11.6.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,175 @@
1
+ """Tier 1: Auto-validation suite for microstructure features (<30 sec).
2
+
3
+ Run on every precompute to catch data quality issues early.
4
+ This is the smoke test - fast checks that should always pass.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import TYPE_CHECKING
10
+
11
+ import numpy as np
12
+
13
+ if TYPE_CHECKING:
14
+ import pandas as pd
15
+
16
# Microstructure feature columns (Issue #25)
# Validation below is applied only to the subset of these columns that is
# actually present in the incoming DataFrame.
FEATURE_COLS = [
    "duration_us",
    "ofi",
    "vwap_close_deviation",
    "price_impact",
    "kyle_lambda_proxy",
    "trade_intensity",
    "volume_per_trade",
    "aggression_ratio",
    "aggregation_density",
    "turnover_imbalance",
]

# Validation thresholds
MIN_SAMPLES_FOR_CORRELATION = 50  # min rows before the OFI-return correlation is computed
OFI_CORR_MIN = 0.05  # |corr(OFI, returns)| must exceed this to be considered sane
OFI_CORR_MAX = 0.8  # ... and stay below this (implausibly strong otherwise)
OFI_MEAN_THRESHOLD = 0.3  # |mean(ofi)| must stay below this bound
AGGRESSION_RATIO_MAX_MEDIAN = 10  # median(aggression_ratio) must stay below this bound
37
+
38
+ def _check_bounds(df: pd.DataFrame, results: dict) -> None:
39
+ """Check feature bounds and populate results dict."""
40
+ # Duration should be non-negative
41
+ if "duration_us" in df.columns:
42
+ results["duration_positive"] = (df["duration_us"] >= 0).all()
43
+ else:
44
+ results["duration_positive"] = True # N/A
45
+
46
+ # OFI should be bounded [-1, 1]
47
+ if "ofi" in df.columns:
48
+ results["ofi_bounded"] = df["ofi"].between(-1, 1).all()
49
+ else:
50
+ results["ofi_bounded"] = True # N/A
51
+
52
+ # Turnover imbalance should be bounded [-1, 1]
53
+ if "turnover_imbalance" in df.columns:
54
+ results["turnover_imbalance_bounded"] = (
55
+ df["turnover_imbalance"].between(-1, 1).all()
56
+ )
57
+ else:
58
+ results["turnover_imbalance_bounded"] = True # N/A
59
+
60
+
61
def _check_correlation(df: pd.DataFrame, results: dict) -> None:
    """Sanity-check the correlation between OFI and bar-to-bar returns.

    Writes ``ofi_return_corr`` (float or None) and ``ofi_corr_sane``
    (bool or None) into ``results``. Both stay None when the required
    columns are missing, the sample is too small, or the correlation is NaN.
    """
    # Default to "inconclusive" until we can actually compute something.
    results["ofi_return_corr"] = None
    results["ofi_corr_sane"] = None

    has_inputs = "Close" in df.columns and "ofi" in df.columns
    if not (has_inputs and len(df) > MIN_SAMPLES_FOR_CORRELATION):
        return

    bar_returns = df["Close"].pct_change()
    corr = df["ofi"].corr(bar_returns)
    if np.isnan(corr):
        return

    corr = float(corr)
    results["ofi_return_corr"] = corr
    # A genuine signal correlates with returns, but not implausibly strongly.
    results["ofi_corr_sane"] = OFI_CORR_MIN < abs(corr) < OFI_CORR_MAX
82
+
83
+
84
def _check_distributions(df: pd.DataFrame, results: dict) -> None:
    """Spot-check distributional properties of selected features.

    Populates ``results`` in place; absent columns record ``True`` (N/A).
    """
    # OFI should average out near zero across many bars.
    results["ofi_mean_near_zero"] = (
        abs(df["ofi"].mean()) < OFI_MEAN_THRESHOLD if "ofi" in df.columns else True
    )
    # The typical (median) aggression ratio should stay within a plausible band.
    results["aggression_ratio_reasonable"] = (
        df["aggression_ratio"].median() < AGGRESSION_RATIO_MAX_MEDIAN
        if "aggression_ratio" in df.columns
        else True
    )
97
+
98
+
99
def validate_tier1(df: pd.DataFrame) -> dict:
    """Fast auto-validation suite (<30 sec), run on every precompute.

    Smoke-test-level quality checks for microstructure features:
    - No NaN/Inf values
    - Bounded features within expected ranges
    - Basic correlation sanity with returns (if available)

    Parameters
    ----------
    df : pd.DataFrame
        Range bar DataFrame with microstructure columns

    Returns
    -------
    dict
        Per-check results plus an overall verdict. Keys include:
        - no_nan: bool - No NaN values in feature columns
        - no_inf: bool - No Inf values in feature columns
        - duration_positive: bool - All durations >= 0
        - ofi_bounded: bool - OFI in [-1, 1]
        - turnover_imbalance_bounded: bool - Turnover imbalance in [-1, 1]
        - ofi_return_corr: float | None - OFI-return correlation if computable
        - ofi_corr_sane: bool | None - OFI correlation within expected range
        - ofi_mean_near_zero: bool - OFI mean close to 0 (balanced market)
        - aggression_ratio_reasonable: bool - Median aggression ratio < 10
        - tier1_passed: bool - All critical checks passed

    Examples
    --------
    >>> from rangebar import get_range_bars
    >>> from rangebar.validation.tier1 import validate_tier1
    >>> df = get_range_bars("BTCUSDT", "2024-01-01", "2024-01-02")
    >>> result = validate_tier1(df)
    >>> print("Tier 1:", "PASSED" if result["tier1_passed"] else "FAILED")
    """
    results: dict = {}

    # Only the feature columns that actually exist can be validated.
    present_cols = [col for col in FEATURE_COLS if col in df.columns]
    if not present_cols:
        results["features_present"] = False
        results["tier1_passed"] = False
        results["error"] = "No microstructure feature columns found"
        return results

    results["features_present"] = True
    results["features_found"] = present_cols

    # 1. NaN/Inf screening over the present feature columns.
    features = df[present_cols]
    results["no_nan"] = not features.isna().any().any()
    numeric_features = features.select_dtypes(include=[np.number])
    results["no_inf"] = not np.isinf(numeric_features).any().any()

    # 2-4. Delegated checks populate `results` in place.
    _check_bounds(df, results)
    _check_correlation(df, results)
    _check_distributions(df, results)

    # 5. Overall verdict: every critical check must hold.
    critical = (
        "no_nan",
        "no_inf",
        "duration_positive",
        "ofi_bounded",
        "turnover_imbalance_bounded",
    )
    results["tier1_passed"] = all(results[name] for name in critical)

    return results
@@ -0,0 +1,261 @@
1
+ """Tier 2: Statistical validation before production ML (~10 min).
2
+
3
+ MANDATORY before ML training. Validates:
4
+ - Stationarity (ADF test)
5
+ - Predictive power (Spearman with forward returns)
6
+ - Mutual information with target
7
+ - Feature correlation matrix (redundancy check)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+ from typing import TYPE_CHECKING
14
+
15
+ import numpy as np
16
+
17
+ from .tier1 import FEATURE_COLS, validate_tier1
18
+
19
+ if TYPE_CHECKING:
20
+ import pandas as pd
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
# Validation thresholds
MIN_SAMPLES_FOR_ADF = 20  # min non-NaN observations required to run an ADF test
MIN_SAMPLES_FOR_SPEARMAN = 100  # min paired rows required for a Spearman test
MIN_SAMPLES_FOR_MI = 50  # min rows required for the mutual-information estimate
ADF_SIGNIFICANCE_LEVEL = 0.05  # ADF p-value below this => series treated as stationary
SPEARMAN_MIN_CORRELATION = 0.02  # min |rho| for a feature to count as significant
SPEARMAN_SIGNIFICANCE_LEVEL = 0.05  # max p-value for a feature to count as significant
HIGH_CORRELATION_THRESHOLD = 0.8  # |spearman| above this flags a redundant feature pair
MIN_SIGNIFICANT_FEATURES = 3  # Tier 2 passes only with at least this many significant features
33
+
34
+
35
def _run_stationarity_tests(
    data: pd.DataFrame,
    present_cols: list[str],
) -> dict:
    """Run an Augmented Dickey-Fuller test on each feature column.

    Returns per-column ADF statistic/p-value/verdict dicts. Columns with too
    few non-NaN samples are skipped; a top-level ``error`` entry is returned
    when statsmodels is not installed.
    """
    try:
        from statsmodels.tsa.stattools import adfuller
    except ImportError:
        logger.warning("statsmodels not installed, skipping stationarity tests")
        return {"error": "statsmodels not installed"}

    stationarity: dict = {}
    for col in present_cols:
        series = data[col].dropna()
        # Too few observations make the ADF test meaningless.
        if len(series) < MIN_SAMPLES_FOR_ADF:
            continue
        try:
            adf = adfuller(series, maxlag=5)
            stationarity[col] = {
                "adf_statistic": float(adf[0]),
                "adf_p_value": float(adf[1]),
                # Reject the unit-root hypothesis at the configured level.
                "is_stationary": adf[1] < ADF_SIGNIFICANCE_LEVEL,
            }
        except Exception as exc:
            # Record the failure per column; other columns still get tested.
            logger.warning("ADF test failed for %s: %s", col, exc)
            stationarity[col] = {"error": str(exc)}

    return stationarity
63
+
64
+
65
def _run_predictive_power_tests(
    data: pd.DataFrame,
    present_cols: list[str],
    target_col: str,
) -> dict:
    """Measure each feature's Spearman correlation with the target column.

    Returns per-column dicts with ``spearman_rho``, ``p_value`` and
    ``significant``. Empty dict when ``target_col`` is absent; an ``error``
    entry when scipy is not installed.
    """
    if target_col not in data.columns:
        return {}

    try:
        from scipy.stats import spearmanr
    except ImportError:
        logger.warning("scipy not installed, skipping Spearman tests")
        return {"error": "scipy not installed"}

    predictive: dict = {}
    for col in present_cols:
        paired = data[[col, target_col]].dropna()
        # Skip columns with too few valid pairs for a meaningful test.
        if len(paired) <= MIN_SAMPLES_FOR_SPEARMAN:
            continue
        try:
            rho, p_value = spearmanr(paired[col], paired[target_col])
            rho_valid = not np.isnan(rho)
            # Significant = valid rho that is both large enough and
            # statistically distinguishable from zero.
            significant = (
                rho_valid
                and abs(rho) > SPEARMAN_MIN_CORRELATION
                and p_value < SPEARMAN_SIGNIFICANCE_LEVEL
            )
            predictive[col] = {
                "spearman_rho": float(rho) if rho_valid else None,
                "p_value": float(p_value) if not np.isnan(p_value) else None,
                "significant": significant,
            }
        except Exception as exc:
            logger.warning("Spearman test failed for %s: %s", col, exc)
            predictive[col] = {"error": str(exc)}

    return predictive
102
+
103
+
104
def _run_mutual_information(
    data: pd.DataFrame,
    present_cols: list[str],
    target_col: str,
) -> dict:
    """Compute mutual-information scores between features and the target.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the feature columns and (possibly) the target.
    present_cols : list[str]
        Feature columns known to exist in ``data``.
    target_col : str
        Name of the prediction target column.

    Returns
    -------
    dict
        Mapping of feature name -> MI score. Empty when the target column is
        missing or the usable sample is too small; an ``error`` entry when
        sklearn is unavailable or the computation fails.
    """
    mutual_info: dict = {}
    if target_col not in data.columns:
        return mutual_info

    try:
        from sklearn.feature_selection import mutual_info_regression

        feature_data = data[present_cols].dropna()
        target_data = data.loc[feature_data.index, target_col].dropna()
        common_idx = feature_data.index.intersection(target_data.index)

        # BUGFIX: gate on the MI sample threshold. This previously used
        # MIN_SAMPLES_FOR_SPEARMAN (100) — an apparent copy-paste from the
        # Spearman check — which silently skipped MI for datasets with
        # 51-100 usable rows even though MIN_SAMPLES_FOR_MI (50) is the
        # threshold declared for this computation (and is re-checked below).
        if len(common_idx) > MIN_SAMPLES_FOR_MI:
            feature_data = feature_data.loc[common_idx]
            target_data = target_data.loc[common_idx]
            # Handle any remaining NaN after alignment
            valid_mask = ~(feature_data.isna().any(axis=1) | target_data.isna())
            feature_data = feature_data[valid_mask]
            target_data = target_data[valid_mask]

            if len(feature_data) > MIN_SAMPLES_FOR_MI:
                # random_state pins the estimator for reproducible scores.
                mi_scores = mutual_info_regression(
                    feature_data, target_data, random_state=42
                )
                mutual_info = dict(
                    zip(present_cols, [float(s) for s in mi_scores], strict=False)
                )
    except ImportError:
        logger.warning("sklearn not installed, skipping mutual information")
        mutual_info = {"error": "sklearn not installed"}
    except Exception as e:
        logger.warning("Mutual information failed: %s", e)
        mutual_info = {"error": str(e)}

    return mutual_info
144
+
145
+
146
def _find_high_correlation_pairs(
    data: pd.DataFrame,
    present_cols: list[str],
) -> list:
    """Return (col1, col2, rho) triples for strongly correlated feature pairs.

    Pairs are reported when |spearman rho| exceeds HIGH_CORRELATION_THRESHOLD;
    failures are logged and yield an empty list.
    """
    pairs: list = []
    try:
        corr = data[present_cols].corr(method="spearman")
        # Walk the upper triangle only, so each pair is examined once.
        for idx, first in enumerate(present_cols):
            for second in present_cols[idx + 1 :]:
                if first not in corr.columns or second not in corr.columns:
                    continue
                value = corr.loc[first, second]
                if not np.isnan(value) and abs(value) > HIGH_CORRELATION_THRESHOLD:
                    pairs.append((first, second, float(value)))
    except Exception as exc:
        logger.warning("Correlation matrix failed: %s", exc)

    return pairs
167
+
168
+
169
def validate_tier2(
    df: pd.DataFrame,
    target_col: str = "forward_return",
) -> dict:
    """Statistical validation (~10 min). MANDATORY before ML training.

    Builds on Tier 1 and adds statistical tests:
    - Stationarity (Augmented Dickey-Fuller test)
    - Predictive power (Spearman correlation with forward returns)
    - Mutual information scores
    - Feature correlation matrix (redundancy detection)

    Parameters
    ----------
    df : pd.DataFrame
        Range bar DataFrame with microstructure columns.
        Should include forward_return column or Close for computing returns.
    target_col : str, default="forward_return"
        Column name for the prediction target.
        If not present, will be computed from Close if available.

    Returns
    -------
    dict
        All Tier 1 results, plus:
        - stationarity: dict[str, dict] - ADF results per feature
        - predictive_power: dict[str, dict] - Spearman correlation results
        - mutual_info: dict[str, float] - MI scores (if sklearn available)
        - high_correlation_pairs: list[tuple] - Highly correlated feature pairs
        - significant_feature_count: int - Features with predictive power
        - tier2_passed: bool - Tier 2 validation passed

    Examples
    --------
    >>> from rangebar import get_range_bars
    >>> from rangebar.validation.tier2 import validate_tier2
    >>> df = get_range_bars("BTCUSDT", "2024-01-01", "2024-01-07")
    >>> df["forward_return"] = df["Close"].shift(-1) / df["Close"] - 1
    >>> result = validate_tier2(df)
    >>> print("Tier 2:", "PASSED" if result["tier2_passed"] else "FAILED")
    """
    # Tier 2 builds on, and requires, a passing Tier 1.
    results = validate_tier1(df)
    present_cols = [col for col in FEATURE_COLS if col in df.columns]

    if not results.get("tier1_passed", False) or not present_cols:
        results["tier2_passed"] = False
        return results

    # Derive the target from Close when the caller did not supply one;
    # copy first so the caller's frame is never mutated.
    frame = df
    if target_col not in frame.columns and "Close" in frame.columns:
        frame = df.copy()
        frame[target_col] = frame["Close"].shift(-1) / frame["Close"] - 1

    # Statistical test batteries (each degrades gracefully if its optional
    # dependency is missing).
    results["stationarity"] = _run_stationarity_tests(frame, present_cols)
    predictive_power = _run_predictive_power_tests(frame, present_cols, target_col)
    results["predictive_power"] = predictive_power
    results["mutual_info"] = _run_mutual_information(frame, present_cols, target_col)
    results["high_correlation_pairs"] = _find_high_correlation_pairs(
        frame, present_cols
    )

    # Pass criterion: enough features showing significant predictive power.
    significant = [
        name
        for name, outcome in predictive_power.items()
        if isinstance(outcome, dict) and outcome.get("significant", False)
    ]
    results["significant_feature_count"] = len(significant)

    results["tier2_passed"] = (
        results.get("tier1_passed", False)
        and len(significant) >= MIN_SIGNIFICANT_FEATURES
    )

    return results