@vizzor/cli 0.13.1 → 0.14.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +251 -192
- package/chronovisor-engine/pyproject.toml +31 -0
- package/chronovisor-engine/src/__init__.py +0 -0
- package/chronovisor-engine/src/inference/__init__.py +0 -0
- package/chronovisor-engine/src/inference/predict.py +44 -0
- package/chronovisor-engine/src/model_catalog.py +219 -0
- package/chronovisor-engine/src/models/__init__.py +0 -0
- package/chronovisor-engine/src/models/anomaly_detector.py +104 -0
- package/chronovisor-engine/src/models/blockchain_cycle_analyzer.py +217 -0
- package/chronovisor-engine/src/models/catalyst_event_model.py +70 -0
- package/chronovisor-engine/src/models/conformal_interval.py +50 -0
- package/chronovisor-engine/src/models/divergence_detector.py +247 -0
- package/chronovisor-engine/src/models/drift_monitor.py +51 -0
- package/chronovisor-engine/src/models/intent_classifier.py +189 -0
- package/chronovisor-engine/src/models/lstm_predictor.py +143 -0
- package/chronovisor-engine/src/models/microstructure_specialist.py +65 -0
- package/chronovisor-engine/src/models/narrative_detector.py +418 -0
- package/chronovisor-engine/src/models/portfolio_optimizer.py +162 -0
- package/chronovisor-engine/src/models/project_risk_scorer.py +184 -0
- package/chronovisor-engine/src/models/pump_detector.py +344 -0
- package/chronovisor-engine/src/models/regime_detector.py +127 -0
- package/chronovisor-engine/src/models/rug_detector.py +197 -0
- package/chronovisor-engine/src/models/sentiment_analyzer.py +257 -0
- package/chronovisor-engine/src/models/signal_classifier.py +191 -0
- package/chronovisor-engine/src/models/stacking_meta.py +56 -0
- package/chronovisor-engine/src/models/strategy_bandit.py +191 -0
- package/chronovisor-engine/src/models/ta_interpreter.py +341 -0
- package/chronovisor-engine/src/models/target_quantile.py +96 -0
- package/chronovisor-engine/src/models/trend_scorer.py +107 -0
- package/chronovisor-engine/src/models/wallet_classifier.py +261 -0
- package/chronovisor-engine/src/server.py +1686 -0
- package/chronovisor-engine/src/training/__init__.py +0 -0
- package/chronovisor-engine/src/training/data_loader.py +635 -0
- package/chronovisor-engine/src/training/pipeline.py +130 -0
- package/chronovisor-engine/src/training/train_catalyst.py +169 -0
- package/chronovisor-engine/src/training/train_classifier.py +159 -0
- package/chronovisor-engine/src/training/train_conformal.py +106 -0
- package/chronovisor-engine/src/training/train_direction.py +215 -0
- package/chronovisor-engine/src/training/train_drift.py +57 -0
- package/chronovisor-engine/src/training/train_isotonic.py +58 -0
- package/chronovisor-engine/src/training/train_lstm.py +217 -0
- package/chronovisor-engine/src/training/train_microstructure.py +102 -0
- package/chronovisor-engine/src/training/train_narrative.py +168 -0
- package/chronovisor-engine/src/training/train_pump.py +109 -0
- package/chronovisor-engine/src/training/train_regime.py +116 -0
- package/chronovisor-engine/src/training/train_rug.py +58 -0
- package/chronovisor-engine/src/training/train_sentiment.py +63 -0
- package/chronovisor-engine/src/training/train_stacking_meta.py +74 -0
- package/chronovisor-engine/src/training/train_target_quantile.py +115 -0
- package/chronovisor-engine/src/training/train_trend.py +101 -0
- package/dist/index.js +22494 -15023
- package/dist/index.js.map +1 -1
- package/package.json +5 -1
- package/vizzor_logodarkicon.png +0 -0
- package/vizzor_logoicon.png +0 -0
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
"""XGBoost direction classifier with walk-forward temporal validation.
|
|
2
|
+
|
|
3
|
+
Trains horizon-specific models (scalp/standard/position) to predict
|
|
4
|
+
price direction (up/down/sideways) from technical + on-chain features.
|
|
5
|
+
Uses proper time-based splits to avoid look-ahead bias.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
|
|
13
|
+
MODEL_DIR = Path(os.getenv("MODEL_DIR", "models"))
|
|
14
|
+
|
|
15
|
+
# Raw indicator features; the names match the FeatureVector keys sent by the Node.js side.
BASE_FEATURE_KEYS = [
    "rsi",
    "macdHistogram",
    "bollingerPercentB",
    "ema12",
    "ema26",
    "atr",
    "obv",
    "fundingRate",
    "fearGreed",
    "priceChange24h",
    "rsiSlope",
    "volumeRatio",
    "emaCrossoverPct",
    "atrPct",
]

# Features derived from price history during preprocessing.
ENGINEERED_KEYS = [
    "lagReturn1",
    "lagReturn3",
    "lagReturn5",
    "rollingVolatility5",
    "rollingVolatility10",
    "priceVsSMA20",
]

# Column order of the training matrix: raw features first, engineered ones after.
ALL_FEATURE_KEYS = [*BASE_FEATURE_KEYS, *ENGINEERED_KEYS]

# Per-profile horizons and the % move a sample must exceed to be labelled up/down.
HORIZON_PROFILES = {
    "scalp": {
        "horizons": ["5m", "15m", "30m"],
        "threshold": 0.15,
    },
    "standard": {
        "horizons": ["1h", "4h"],
        "threshold": 0.35,
    },
    "position": {
        "horizons": ["1d", "7d"],
        "threshold": 0.75,
    },
}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def engineer_features(features: dict, history: list[dict] | None = None) -> dict:
|
|
40
|
+
"""Add lagged returns, rolling volatility, price vs SMA to raw features."""
|
|
41
|
+
result = dict(features)
|
|
42
|
+
|
|
43
|
+
if history and len(history) >= 5:
|
|
44
|
+
prices = [h.get("price", 0) for h in history[-20:]]
|
|
45
|
+
if len(prices) >= 2 and prices[-1] > 0:
|
|
46
|
+
result["lagReturn1"] = (prices[-1] - prices[-2]) / prices[-1] * 100 if prices[-2] > 0 else 0
|
|
47
|
+
else:
|
|
48
|
+
result["lagReturn1"] = 0
|
|
49
|
+
|
|
50
|
+
if len(prices) >= 4 and prices[-1] > 0:
|
|
51
|
+
result["lagReturn3"] = (prices[-1] - prices[-4]) / prices[-1] * 100 if prices[-4] > 0 else 0
|
|
52
|
+
else:
|
|
53
|
+
result["lagReturn3"] = 0
|
|
54
|
+
|
|
55
|
+
if len(prices) >= 6 and prices[-1] > 0:
|
|
56
|
+
result["lagReturn5"] = (prices[-1] - prices[-6]) / prices[-1] * 100 if prices[-6] > 0 else 0
|
|
57
|
+
else:
|
|
58
|
+
result["lagReturn5"] = 0
|
|
59
|
+
|
|
60
|
+
if len(prices) >= 5:
|
|
61
|
+
returns = [
|
|
62
|
+
(prices[i] - prices[i - 1]) / prices[i - 1] * 100
|
|
63
|
+
for i in range(1, len(prices))
|
|
64
|
+
if prices[i - 1] > 0
|
|
65
|
+
]
|
|
66
|
+
result["rollingVolatility5"] = float(np.std(returns[-5:])) if len(returns) >= 5 else 0
|
|
67
|
+
result["rollingVolatility10"] = float(np.std(returns[-10:])) if len(returns) >= 10 else 0
|
|
68
|
+
else:
|
|
69
|
+
result["rollingVolatility5"] = 0
|
|
70
|
+
result["rollingVolatility10"] = 0
|
|
71
|
+
|
|
72
|
+
if len(prices) >= 20:
|
|
73
|
+
sma20 = np.mean(prices[-20:])
|
|
74
|
+
result["priceVsSMA20"] = (prices[-1] - sma20) / sma20 * 100 if sma20 > 0 else 0
|
|
75
|
+
else:
|
|
76
|
+
result["priceVsSMA20"] = 0
|
|
77
|
+
else:
|
|
78
|
+
for key in ENGINEERED_KEYS:
|
|
79
|
+
result.setdefault(key, 0)
|
|
80
|
+
|
|
81
|
+
return result
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class DirectionTrainer:
|
|
85
|
+
"""Walk-forward XGBoost trainer for direction prediction."""
|
|
86
|
+
|
|
87
|
+
def __init__(self, profile: str = "standard"):
|
|
88
|
+
if profile not in HORIZON_PROFILES:
|
|
89
|
+
raise ValueError(f"Unknown profile: {profile}. Use: {list(HORIZON_PROFILES.keys())}")
|
|
90
|
+
self.profile = profile
|
|
91
|
+
self.threshold = HORIZON_PROFILES[profile]["threshold"]
|
|
92
|
+
|
|
93
|
+
def label_direction(self, change_pct: float) -> int:
|
|
94
|
+
"""Convert price change to direction label: 0=down, 1=sideways, 2=up."""
|
|
95
|
+
if change_pct > self.threshold:
|
|
96
|
+
return 2 # up
|
|
97
|
+
elif change_pct < -self.threshold:
|
|
98
|
+
return 0 # down
|
|
99
|
+
return 1 # sideways
|
|
100
|
+
|
|
101
|
+
def run(self, outcomes: list[dict] | None = None) -> dict:
|
|
102
|
+
"""Train XGBoost model from prediction outcomes.
|
|
103
|
+
|
|
104
|
+
Each outcome should have:
|
|
105
|
+
- features: dict of feature values
|
|
106
|
+
- changePct: actual price change percentage
|
|
107
|
+
- horizon: string horizon label
|
|
108
|
+
|
|
109
|
+
Returns training result dict.
|
|
110
|
+
"""
|
|
111
|
+
import time
|
|
112
|
+
start = time.time()
|
|
113
|
+
|
|
114
|
+
if not outcomes or len(outcomes) < 30:
|
|
115
|
+
return {
|
|
116
|
+
"model": f"xgb-direction-{self.profile}",
|
|
117
|
+
"status": "skipped",
|
|
118
|
+
"metrics": None,
|
|
119
|
+
"duration_seconds": time.time() - start,
|
|
120
|
+
"artifact_path": "",
|
|
121
|
+
"error": f"Need at least 30 samples, got {len(outcomes) if outcomes else 0}",
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
try:
|
|
125
|
+
import xgboost as xgb
|
|
126
|
+
from sklearn.metrics import accuracy_score, classification_report
|
|
127
|
+
except ImportError:
|
|
128
|
+
return {
|
|
129
|
+
"model": f"xgb-direction-{self.profile}",
|
|
130
|
+
"status": "failed",
|
|
131
|
+
"metrics": None,
|
|
132
|
+
"duration_seconds": time.time() - start,
|
|
133
|
+
"artifact_path": "",
|
|
134
|
+
"error": "xgboost not installed",
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
# Filter outcomes for this profile's horizons
|
|
138
|
+
valid_horizons = set(HORIZON_PROFILES[self.profile]["horizons"])
|
|
139
|
+
filtered = [o for o in outcomes if o.get("horizon") in valid_horizons]
|
|
140
|
+
if len(filtered) < 30:
|
|
141
|
+
filtered = outcomes # Fall back to all outcomes if not enough per-profile
|
|
142
|
+
|
|
143
|
+
# Build feature matrix and labels
|
|
144
|
+
X = np.array([
|
|
145
|
+
[o.get("features", {}).get(k, 0) for k in ALL_FEATURE_KEYS]
|
|
146
|
+
for o in filtered
|
|
147
|
+
])
|
|
148
|
+
y = np.array([self.label_direction(o.get("changePct", 0)) for o in filtered])
|
|
149
|
+
|
|
150
|
+
# Walk-forward temporal split: 70% train, 15% val, 15% test
|
|
151
|
+
n = len(X)
|
|
152
|
+
train_end = int(n * 0.70)
|
|
153
|
+
val_end = int(n * 0.85)
|
|
154
|
+
|
|
155
|
+
X_train, y_train = X[:train_end], y[:train_end]
|
|
156
|
+
X_val, y_val = X[train_end:val_end], y[train_end:val_end]
|
|
157
|
+
X_test, y_test = X[val_end:], y[val_end:]
|
|
158
|
+
|
|
159
|
+
if len(X_test) < 5:
|
|
160
|
+
return {
|
|
161
|
+
"model": f"xgb-direction-{self.profile}",
|
|
162
|
+
"status": "skipped",
|
|
163
|
+
"metrics": None,
|
|
164
|
+
"duration_seconds": time.time() - start,
|
|
165
|
+
"artifact_path": "",
|
|
166
|
+
"error": "Not enough data for temporal split",
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
# Train XGBoost with early stopping
|
|
170
|
+
model = xgb.XGBClassifier(
|
|
171
|
+
n_estimators=200,
|
|
172
|
+
max_depth=4,
|
|
173
|
+
learning_rate=0.05,
|
|
174
|
+
subsample=0.8,
|
|
175
|
+
colsample_bytree=0.8,
|
|
176
|
+
min_child_weight=3,
|
|
177
|
+
reg_alpha=0.1,
|
|
178
|
+
reg_lambda=1.0,
|
|
179
|
+
objective="multi:softprob",
|
|
180
|
+
num_class=3,
|
|
181
|
+
eval_metric="mlogloss",
|
|
182
|
+
early_stopping_rounds=20,
|
|
183
|
+
random_state=42,
|
|
184
|
+
)
|
|
185
|
+
|
|
186
|
+
model.fit(
|
|
187
|
+
X_train, y_train,
|
|
188
|
+
eval_set=[(X_val, y_val)],
|
|
189
|
+
verbose=False,
|
|
190
|
+
)
|
|
191
|
+
|
|
192
|
+
# Evaluate on test set
|
|
193
|
+
y_pred = model.predict(X_test)
|
|
194
|
+
accuracy = float(accuracy_score(y_test, y_pred))
|
|
195
|
+
|
|
196
|
+
# Save model
|
|
197
|
+
MODEL_DIR.mkdir(parents=True, exist_ok=True)
|
|
198
|
+
model_path = MODEL_DIR / f"xgb_direction_{self.profile}.json"
|
|
199
|
+
model.save_model(str(model_path))
|
|
200
|
+
|
|
201
|
+
duration = time.time() - start
|
|
202
|
+
|
|
203
|
+
return {
|
|
204
|
+
"model": f"xgb-direction-{self.profile}",
|
|
205
|
+
"status": "success",
|
|
206
|
+
"metrics": {
|
|
207
|
+
"accuracy": round(accuracy, 4),
|
|
208
|
+
"train_samples": len(X_train),
|
|
209
|
+
"val_samples": len(X_val),
|
|
210
|
+
"test_samples": len(X_test),
|
|
211
|
+
"best_iteration": model.best_iteration if hasattr(model, "best_iteration") else None,
|
|
212
|
+
},
|
|
213
|
+
"duration_seconds": round(duration, 2),
|
|
214
|
+
"artifact_path": str(model_path),
|
|
215
|
+
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Baseline feature-distribution trainer for drift monitoring."""
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import joblib
|
|
7
|
+
|
|
8
|
+
from .data_loader import load_meta_prediction_frame
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DriftMonitorTrainer:
    """Stores per-column summary statistics as a baseline for drift checks."""

    model_name = "meta_drift_detector"
    artifact_path = Path("models") / "meta_drift_detector.joblib"

    def run(self, days: int = 180) -> dict:
        """Compute and persist the baseline from the meta-prediction frame.

        Args:
            days: Look-back window handed to ``load_meta_prediction_frame``.

        Returns:
            A result dict (``model``, ``status``, ``metrics``,
            ``duration_seconds``, ``artifact_path``); a skipped run also
            carries ``error``.
        """
        started = time.time()
        history = load_meta_prediction_frame(days)

        # Guard: too little history to describe a distribution.
        if history.empty or len(history) < 50:
            return {
                "model": self.model_name,
                "status": "skipped",
                "metrics": {"samples": len(history)},
                "duration_seconds": round(time.time() - started, 2),
                "artifact_path": "",
                "error": "Need at least 50 resolved predictions",
            }

        # Everything except the identifier/label columns is profiled; gaps become 0.0.
        feature_frame = history.drop(columns=["model", "horizon", "was_correct"])
        feature_frame = feature_frame.fillna(0.0)

        baseline = {}
        for name in feature_frame.columns:
            values = feature_frame[name]
            baseline[name] = {
                "mean": float(values.mean()),
                # Floor keeps the stored std strictly positive.
                "std": float(max(values.std(), 1e-6)),
                "p10": float(values.quantile(0.10)),
                "p90": float(values.quantile(0.90)),
            }

        self.artifact_path.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            "baseline": baseline,
            "trained_at": str(int(time.time())),
        }
        joblib.dump(payload, self.artifact_path)

        metrics = {
            "features": len(baseline),
            "samples": len(feature_frame),
        }
        return {
            "model": self.model_name,
            "status": "success",
            "metrics": metrics,
            "duration_seconds": round(time.time() - started, 2),
            "artifact_path": str(self.artifact_path),
        }
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Isotonic probability calibration trainer from resolved prediction history."""
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import joblib
|
|
7
|
+
import numpy as np
|
|
8
|
+
from sklearn.isotonic import IsotonicRegression
|
|
9
|
+
|
|
10
|
+
from .data_loader import load_meta_prediction_frame
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class IsotonicTrainer:
    """Fits an isotonic map from predicted probability to observed correctness."""

    model_name = "meta_isotonic"
    artifact_path = Path("models") / "meta_isotonic.joblib"

    def run(self, days: int = 180) -> dict:
        """Fit the calibrator on the meta-prediction frame and persist it.

        Args:
            days: Look-back window handed to ``load_meta_prediction_frame``.

        Returns:
            A result dict (``model``, ``status``, ``metrics``,
            ``duration_seconds``, ``artifact_path``); a skipped run also
            carries ``error``.
        """
        started = time.time()
        history = load_meta_prediction_frame(days)

        if history.empty or len(history) < 50:
            return {
                "model": self.model_name,
                "status": "skipped",
                "metrics": {"samples": len(history)},
                "duration_seconds": round(time.time() - started, 2),
                "artifact_path": "",
                "error": "Need at least 50 resolved predictions",
            }

        probabilities = history["probability"].astype(float).to_numpy()
        outcomes = history["was_correct"].astype(float).to_numpy()

        # Outputs are bounded to [0.05, 0.95]; out-of-range inputs are clipped.
        calibrator = IsotonicRegression(y_min=0.05, y_max=0.95, out_of_bounds="clip")
        calibrator.fit(probabilities, outcomes)

        # Error is measured on the same rows the calibrator was fitted on.
        residuals = calibrator.predict(probabilities) - outcomes
        mae = float(np.mean(np.abs(residuals)))
        sample_count = len(probabilities)

        self.artifact_path.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            "model": calibrator,
            "trained_at": str(int(time.time())),
            "accuracy": 1.0 - mae,
            "samples": sample_count,
        }
        joblib.dump(payload, self.artifact_path)

        return {
            "model": self.model_name,
            "status": "success",
            "metrics": {
                "mae": mae,
                "samples": sample_count,
            },
            "duration_seconds": round(time.time() - started, 2),
            "artifact_path": str(self.artifact_path),
        }
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
"""Training script for the LSTM price direction predictor.
|
|
2
|
+
|
|
3
|
+
Updated to produce proper (batch, T=100, F=8) shaped input instead of
|
|
4
|
+
the incorrect (batch, 1, N) that treated all features as a single timestep.
|
|
5
|
+
|
|
6
|
+
Per-candle features (8): close, rsi, macdHist, bbPercentB, atr, obv, funding, fearGreed
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
import torch
|
|
14
|
+
import torch.nn as nn
|
|
15
|
+
from torch.utils.data import DataLoader, TensorDataset
|
|
16
|
+
|
|
17
|
+
from .data_loader import create_sequences, get_training_symbols, load_ohlcv
|
|
18
|
+
|
|
19
|
+
MODEL_DIR = Path(os.getenv("MODEL_DIR", "models"))
|
|
20
|
+
|
|
21
|
+
# Per-candle features: matches lstm_predictor.py PER_CANDLE_FEATURES
|
|
22
|
+
NUM_FEATURES = 8
|
|
23
|
+
SEQUENCE_LENGTH = 100
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class PriceLSTM(nn.Module):
    """Recurrent classifier mapping a candle sequence to direction logits.

    Input shape: (batch, T=100, F=8)
    Output: one logit per class, ordered down, sideways, up.
    """

    def __init__(self, input_size: int = NUM_FEATURES, hidden_size: int = 64, num_layers: int = 2):
        super().__init__()
        # Stacked LSTM encoder; batch_first keeps tensors as (batch, time, feature).
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.2,
        )
        # Classification head: hidden state -> 32 units -> 3 class logits.
        head_layers = [
            nn.Linear(hidden_size, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 3),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Classify each sequence from the LSTM output at its final timestep."""
        sequence_output = self.lstm(x)[0]
        final_step = sequence_output[:, -1, :]
        return self.fc(final_step)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def create_enriched_sequences(
    df, window: int = SEQUENCE_LENGTH, horizon: int = 4, threshold_pct: float = 1.0
) -> tuple[np.ndarray, np.ndarray]:
    """Create training sequences with per-candle TA features.

    Each candle contributes 8 features, in this order:
    close, rsi, macd_hist, bb_percent_b, atr, obv, funding_rate, fear_greed.
    The close column of every window is divided by the window's first close.

    Falls back to ``create_sequences()`` when any required TA column is
    missing (legacy data); that path returns whatever the helper returns.

    Args:
        df: Candle frame with at least the required TA columns.
            ``funding_rate`` / ``fear_greed`` are optional and default to
            0.0 / 50.0. The caller's frame is not modified.
        window: Number of candles per input sequence.
        horizon: How many candles after the window's last candle the label
            looks ahead.
        threshold_pct: Percent move beyond which a sample is labelled up (2)
            or down (0); anything in between is sideways (1).

    Returns:
        ``(X, y)`` with ``X`` float32 of shape ``(n, window, 8)`` and ``y``
        int64 of shape ``(n,)``. When no window qualifies, ``X`` is still
        3-D with ``n == 0``.
    """
    required_cols = ["close", "rsi", "macd_hist", "bb_percent_b", "atr", "obv"]
    if any(col not in df.columns for col in required_cols):
        # Legacy path: use raw OHLCV from data_loader
        return create_sequences(df, window, horizon)

    feature_cols = [
        "close", "rsi", "macd_hist", "bb_percent_b", "atr", "obv",
        "funding_rate", "fear_greed",
    ]

    # Fill missing optional columns with defaults; assign() returns a new frame.
    optional_defaults = {"funding_rate": 0.0, "fear_greed": 50.0}
    missing = {col: value for col, value in optional_defaults.items() if col not in df.columns}
    if missing:
        df = df.assign(**missing)

    # Float dtype guarantees the in-place close normalization below is valid.
    data = df[feature_cols].to_numpy(dtype=np.float64)
    closes = data[:, 0]  # un-normalized closes; windows are copied before scaling

    X, y = [], []
    for i in range(window, len(data) - horizon):
        base_price = data[i - window, 0]
        current_close = closes[i - 1]
        # A zero divisor would turn the window or its label into inf/NaN.
        if base_price == 0 or current_close == 0:
            continue

        pct_change = (closes[i + horizon - 1] - current_close) / current_close * 100
        # NaN closes would otherwise be silently labelled "sideways".
        if not np.isfinite(pct_change):
            continue

        window_data = data[i - window: i].copy()
        window_data[:, 0] /= base_price  # normalize closes by first close in window

        if pct_change > threshold_pct:
            label = 2  # up
        elif pct_change < -threshold_pct:
            label = 0  # down
        else:
            label = 1  # sideways

        X.append(window_data)
        y.append(label)

    if not X:
        # Keep the rank stable so callers can still concatenate / inspect shape.
        return (
            np.empty((0, window, len(feature_cols)), dtype=np.float32),
            np.empty((0,), dtype=np.int64),
        )

    return np.array(X, dtype=np.float32), np.array(y, dtype=np.int64)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def fit_scaler(X: np.ndarray):
    """Fit and return a StandardScaler over the feature axis of sequence data.

    ``X`` must be 3-D, (N, T, F); it is flattened to (N*T, F) so each of the
    F features gets one mean/scale estimated across all samples and timesteps.
    """
    from sklearn.preprocessing import StandardScaler

    # Unpacking also rejects input that is not exactly 3-D.
    _, _, feature_count = X.shape
    per_step_rows = X.reshape(-1, feature_count)
    return StandardScaler().fit(per_step_rows)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def apply_scaler(X: np.ndarray, scaler) -> np.ndarray:
    """Transform (N, T, F) sequence data with a fitted scaler.

    The array is flattened to (N*T, F) for ``scaler.transform`` and the result
    is restored to the original 3-D shape as float32.
    """
    sample_count, step_count, feature_count = X.shape
    transformed = scaler.transform(X.reshape(-1, feature_count))
    restored = transformed.reshape(sample_count, step_count, feature_count)
    return restored.astype(np.float32)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def train(days: int = 90, epochs: int = 50, batch_size: int = 32, lr: float = 1e-3):
    """Train LSTM on historical OHLCV data from the available symbol universe.

    Loads 4h candles per symbol, builds (T=SEQUENCE_LENGTH, F=NUM_FEATURES)
    sequences, splits them 80/20 by position, fits the feature scaler on the
    training split only, and trains ``PriceLSTM``. The scaler is written to
    ``MODEL_DIR / "lstm_predictor_scaler.joblib"`` and the best-validation
    checkpoint (whole pickled model) to ``MODEL_DIR / "lstm_predictor.pt"``.

    Args:
        days: History window requested from ``load_ohlcv``.
        epochs: Number of passes over the training split.
        batch_size: Mini-batch size for both loaders.
        lr: Adam learning rate.

    Returns:
        None. Progress and results are reported via ``print``.
    """
    print(f"Loading data ({days} days)...")

    expected_shape = (SEQUENCE_LENGTH, NUM_FEATURES)
    all_X, all_y = [], []
    for symbol in get_training_symbols():
        # Best-effort per symbol: one bad symbol must not abort the whole run.
        try:
            df = load_ohlcv(symbol, "4h", days)
            if len(df) < 200:
                print(f" {symbol}: insufficient data ({len(df)} rows), skipping")
                continue
            X, y = create_enriched_sequences(df, window=SEQUENCE_LENGTH, horizon=4)
            # Empty batches or legacy-shaped sequences cannot be concatenated
            # with, or fed to, the NUM_FEATURES-wide model.
            if len(X) == 0 or X.ndim != 3 or X.shape[1:] != expected_shape:
                print(f" {symbol}: unexpected sequence shape {X.shape}, skipping")
                continue
            all_X.append(X)
            all_y.append(y)
            print(f" {symbol}: {len(X)} sequences, shape {X.shape}")
        except Exception as e:
            print(f" {symbol}: error — {e}")

    if not all_X:
        print("No training data available. Ensure data collector has run.")
        return

    X = np.concatenate(all_X)
    y = np.concatenate(all_y)
    print(f"Total: {len(X)} sequences (shape {X.shape}), class distribution: {np.bincount(y)}")

    # Split 80/20 before scaling so validation statistics never reach the scaler.
    split = int(len(X) * 0.8)
    if split == 0:
        print("Not enough sequences to form a training split.")
        return
    X_train, X_val = X[:split], X[split:]
    y_train, y_val = y[:split], y[split:]

    # Fit on the training split only, then apply the same transform to both.
    scaler = fit_scaler(X_train)
    X_train = apply_scaler(X_train, scaler)
    X_val = apply_scaler(X_val, scaler)

    # Save scaler for inference normalization
    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    import joblib
    scaler_path = MODEL_DIR / "lstm_predictor_scaler.joblib"
    joblib.dump(scaler, scaler_path)
    print(f"Scaler saved to {scaler_path}")

    train_ds = TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
    val_ds = TensorDataset(torch.tensor(X_val), torch.tensor(y_val))
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=batch_size)

    model = PriceLSTM(input_size=NUM_FEATURES)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    checkpoint_path = MODEL_DIR / "lstm_predictor.pt"
    checkpoint_written = False
    best_val_acc = 0.0
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for X_batch, y_batch in train_dl:
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Validation
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for X_batch, y_batch in val_dl:
                output = model(X_batch)
                preds = output.argmax(dim=-1)
                correct += (preds == y_batch).sum().item()
                total += len(y_batch)

        val_acc = correct / max(1, total)
        avg_loss = total_loss / len(train_dl)

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch + 1}/{epochs} — loss: {avg_loss:.4f}, val_acc: {val_acc:.3f}")

        # Checkpoint only on strict improvement; the whole module is pickled.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model, checkpoint_path)
            checkpoint_written = True

    print(f"Training complete. Best validation accuracy: {best_val_acc:.3f}")
    if checkpoint_written:
        print(f"Model saved to {checkpoint_path}")
    else:
        # Do not claim a file exists when no epoch ever beat the initial 0.0.
        print("No checkpoint written: validation accuracy never rose above 0.")
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
if __name__ == "__main__":
|
|
217
|
+
train()
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Short-horizon microstructure specialist trainer."""
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import joblib
|
|
7
|
+
import numpy as np
|
|
8
|
+
from sklearn.metrics import accuracy_score, f1_score
|
|
9
|
+
|
|
10
|
+
from .data_loader import load_microstructure_training_frame
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class MicrostructureTrainer:
    """Trains a 3-class classifier on 1-minute microstructure features."""

    model_name = "microstructure_specialist"
    artifact_path = Path("models") / "microstructure_specialist.joblib"
    # Column order of the feature matrix; stored alongside the model artifact.
    feature_keys = [
        "return_1",
        "volume_ratio",
        "range_pct",
        "wick_imbalance",
        "trade_intensity",
        "price_vs_sma20",
        "volatility_5",
    ]

    def _fit_classifier(self, X_train, y_train):
        """Fit XGBoost, or a random forest if XGBoost cannot be imported or fitted.

        Returns:
            ``(model, engine)`` where engine is "xgboost" or "random_forest".
        """
        try:
            import xgboost as xgb

            booster = xgb.XGBClassifier(
                n_estimators=220,
                max_depth=4,
                learning_rate=0.05,
                subsample=0.85,
                colsample_bytree=0.85,
                objective="multi:softprob",
                num_class=3,
                eval_metric="mlogloss",
                random_state=42,
            )
            booster.fit(X_train, y_train)
            return booster, "xgboost"
        except Exception:
            # Any failure above (import or fit) switches to the sklearn fallback.
            from sklearn.ensemble import RandomForestClassifier

            forest = RandomForestClassifier(
                n_estimators=250,
                max_depth=10,
                min_samples_leaf=4,
                class_weight="balanced_subsample",
                random_state=42,
                n_jobs=-1,
            )
            forest.fit(X_train, y_train)
            return forest, "random_forest"

    def run(self, days: int = 30) -> dict:
        """Train on the first 80% of rows, score on the rest, persist the model.

        Args:
            days: Look-back window handed to
                ``load_microstructure_training_frame``.

        Returns:
            A result dict (``model``, ``status``, ``metrics``,
            ``duration_seconds``, ``artifact_path``); a skipped run also
            carries ``error``.
        """
        started = time.time()
        frame = load_microstructure_training_frame(days=days, timeframe="1m")

        if frame.empty or len(frame) < 80:
            return {
                "model": self.model_name,
                "status": "skipped",
                "metrics": {"samples": len(frame)},
                "duration_seconds": round(time.time() - started, 2),
                "artifact_path": "",
                "error": "Need at least 80 microstructure samples",
            }

        features = frame[self.feature_keys].fillna(0.0).astype(np.float32).values
        labels = frame["y"].astype(np.int64).values

        # Positional split: the leading 80% trains, the trailing 20% tests.
        cutoff = int(len(features) * 0.80)
        X_train, y_train = features[:cutoff], labels[:cutoff]
        X_test, y_test = features[cutoff:], labels[cutoff:]

        model, engine = self._fit_classifier(X_train, y_train)

        predictions = model.predict(X_test)
        accuracy = float(accuracy_score(y_test, predictions))
        f1 = float(f1_score(y_test, predictions, average="weighted", zero_division=0))

        self.artifact_path.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            "model": model,
            "engine": engine,
            "feature_keys": self.feature_keys,
            "trained_at": str(int(time.time())),
            "accuracy": accuracy,
        }
        joblib.dump(payload, self.artifact_path)

        return {
            "model": self.model_name,
            "status": "success",
            "metrics": {
                "accuracy": accuracy,
                "f1": f1,
                "test_samples": len(X_test),
            },
            "duration_seconds": round(time.time() - started, 2),
            "artifact_path": str(self.artifact_path),
        }
|