@vizzor/cli 0.13.1 → 0.14.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +250 -192
- package/chronovisor-engine/pyproject.toml +31 -0
- package/chronovisor-engine/src/__init__.py +0 -0
- package/chronovisor-engine/src/inference/__init__.py +0 -0
- package/chronovisor-engine/src/inference/predict.py +44 -0
- package/chronovisor-engine/src/model_catalog.py +219 -0
- package/chronovisor-engine/src/models/__init__.py +0 -0
- package/chronovisor-engine/src/models/anomaly_detector.py +104 -0
- package/chronovisor-engine/src/models/blockchain_cycle_analyzer.py +217 -0
- package/chronovisor-engine/src/models/catalyst_event_model.py +70 -0
- package/chronovisor-engine/src/models/conformal_interval.py +50 -0
- package/chronovisor-engine/src/models/divergence_detector.py +247 -0
- package/chronovisor-engine/src/models/drift_monitor.py +51 -0
- package/chronovisor-engine/src/models/intent_classifier.py +189 -0
- package/chronovisor-engine/src/models/lstm_predictor.py +143 -0
- package/chronovisor-engine/src/models/microstructure_specialist.py +65 -0
- package/chronovisor-engine/src/models/narrative_detector.py +418 -0
- package/chronovisor-engine/src/models/portfolio_optimizer.py +162 -0
- package/chronovisor-engine/src/models/project_risk_scorer.py +184 -0
- package/chronovisor-engine/src/models/pump_detector.py +344 -0
- package/chronovisor-engine/src/models/regime_detector.py +127 -0
- package/chronovisor-engine/src/models/rug_detector.py +197 -0
- package/chronovisor-engine/src/models/sentiment_analyzer.py +257 -0
- package/chronovisor-engine/src/models/signal_classifier.py +191 -0
- package/chronovisor-engine/src/models/stacking_meta.py +56 -0
- package/chronovisor-engine/src/models/strategy_bandit.py +191 -0
- package/chronovisor-engine/src/models/ta_interpreter.py +341 -0
- package/chronovisor-engine/src/models/target_quantile.py +96 -0
- package/chronovisor-engine/src/models/trend_scorer.py +107 -0
- package/chronovisor-engine/src/models/wallet_classifier.py +261 -0
- package/chronovisor-engine/src/server.py +1686 -0
- package/chronovisor-engine/src/training/__init__.py +0 -0
- package/chronovisor-engine/src/training/data_loader.py +635 -0
- package/chronovisor-engine/src/training/pipeline.py +130 -0
- package/chronovisor-engine/src/training/train_catalyst.py +169 -0
- package/chronovisor-engine/src/training/train_classifier.py +159 -0
- package/chronovisor-engine/src/training/train_conformal.py +106 -0
- package/chronovisor-engine/src/training/train_direction.py +215 -0
- package/chronovisor-engine/src/training/train_drift.py +57 -0
- package/chronovisor-engine/src/training/train_isotonic.py +58 -0
- package/chronovisor-engine/src/training/train_lstm.py +217 -0
- package/chronovisor-engine/src/training/train_microstructure.py +102 -0
- package/chronovisor-engine/src/training/train_narrative.py +168 -0
- package/chronovisor-engine/src/training/train_pump.py +109 -0
- package/chronovisor-engine/src/training/train_regime.py +116 -0
- package/chronovisor-engine/src/training/train_rug.py +58 -0
- package/chronovisor-engine/src/training/train_sentiment.py +63 -0
- package/chronovisor-engine/src/training/train_stacking_meta.py +74 -0
- package/chronovisor-engine/src/training/train_target_quantile.py +115 -0
- package/chronovisor-engine/src/training/train_trend.py +101 -0
- package/dist/index.js +19124 -11698
- package/dist/index.js.map +1 -1
- package/package.json +3 -1
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Training Pipeline — pulls data, generates labels, trains models, saves artifacts.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import json
|
|
7
|
+
import time
|
|
8
|
+
import logging
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)

# Root directory for model artifacts; every pipeline writes into its own
# sub-directory. Overridable through the MODEL_ARTIFACT_DIR environment variable.
ARTIFACT_DIR = Path(os.getenv("MODEL_ARTIFACT_DIR", "models"))


class TrainingPipeline:
    """Base training pipeline for all ML models.

    Subclasses override ``load_data``, ``preprocess``, ``train`` and
    ``evaluate``; ``run`` chains those steps, persists the result through
    ``save`` and returns a status dictionary.
    """

    def __init__(self, model_name: str):
        """Remember the model name and create its artifact directory."""
        self.model_name = model_name
        self.artifact_dir = ARTIFACT_DIR / model_name
        self.artifact_dir.mkdir(parents=True, exist_ok=True)

    def load_data(self) -> Any:
        """Override in subclass to load training data."""
        raise NotImplementedError

    def preprocess(self, data: Any) -> tuple:
        """Override to split and preprocess data. Returns (X_train, X_val, X_test, y_train, y_val, y_test)."""
        raise NotImplementedError

    def train(self, X_train: Any, y_train: Any, X_val: Any, y_val: Any) -> Any:
        """Override to train the model. Returns trained model."""
        raise NotImplementedError

    def evaluate(self, model: Any, X_test: Any, y_test: Any) -> dict:
        """Override to evaluate the model. Returns metrics dict."""
        raise NotImplementedError

    def fit_scaler(self, X: np.ndarray) -> Any:
        """Fit a StandardScaler on training features and save it.

        Call this in preprocess() after splitting data, before training.
        The scaler is saved as {model_name}_scaler.joblib for inference use
        and is also kept on the instance for ``transform_features``.

        Returns:
            The fitted scaler.
        """
        # Imported lazily so the module can be imported without these packages.
        from sklearn.preprocessing import StandardScaler
        import joblib

        scaler = StandardScaler()
        scaler.fit(X)

        scaler_path = self.artifact_dir / f"{self.model_name}_scaler.joblib"
        joblib.dump(scaler, scaler_path)
        logger.info("Scaler saved to %s", scaler_path)

        self._scaler = scaler
        return scaler

    def transform_features(self, X: np.ndarray, scaler: Any = None) -> np.ndarray:
        """Apply a fitted StandardScaler to features.

        Uses the internally stored scaler if none provided. When no scaler is
        available at all, a warning is logged and ``X`` is returned unchanged.
        """
        # Explicit None check: an estimator's truthiness must not decide
        # whether it is used.
        active = scaler if scaler is not None else getattr(self, "_scaler", None)
        if active is None:
            logger.warning("No scaler available — returning raw features")
            return X
        return active.transform(X).astype(np.float32)

    @staticmethod
    def load_scaler(model_name: str) -> Any:
        """Load a previously saved scaler for inference.

        Looks in ``ARTIFACT_DIR/<model_name>/`` first and then falls back to
        the flat ``ARTIFACT_DIR`` directory. Returns ``None`` when neither
        file exists.
        """
        import joblib

        filename = f"{model_name}_scaler.joblib"
        candidates = (
            ARTIFACT_DIR / model_name / filename,
            # Fallback: check flat models/ directory
            ARTIFACT_DIR / filename,
        )
        for candidate in candidates:
            if candidate.exists():
                return joblib.load(candidate)
        return None

    @staticmethod
    def _json_default(value: Any) -> Any:
        """Convert NumPy scalars and arrays into JSON-serialisable objects."""
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable"
        )

    def save(self, model: Any, metrics: dict) -> str:
        """Save model artifact and metrics.

        The model is pickled to ``{model_name}_{timestamp}.pkl`` and the
        metrics are written next to it as ``metrics_{timestamp}.json``.

        Returns:
            The path of the pickled model as a string.
        """
        import pickle

        timestamp = int(time.time())
        artifact_path = self.artifact_dir / f"{self.model_name}_{timestamp}.pkl"

        # Persist the model itself; without this the returned path would
        # point at a file that was never written.
        with open(artifact_path, "wb") as f:
            pickle.dump(model, f)

        # Save metrics (NumPy values are converted so json does not reject them)
        metrics_path = self.artifact_dir / f"metrics_{timestamp}.json"
        with open(metrics_path, "w", encoding="utf-8") as f:
            json.dump(metrics, f, indent=2, default=self._json_default)

        logger.info("Model saved to %s", artifact_path)
        return str(artifact_path)

    def run(self) -> dict:
        """Execute the full training pipeline.

        Returns:
            A dictionary with ``model``, ``status``, ``duration_seconds`` and
            ``artifact_path``; on success it also carries ``metrics``, on
            failure ``error``. Exceptions raised by any step are caught,
            logged with their traceback and reported as ``status: failed``.
        """
        start = time.time()
        logger.info("Starting training pipeline for %s", self.model_name)

        try:
            data = self.load_data()
            splits = self.preprocess(data)
            X_train, X_val, X_test, y_train, y_val, y_test = splits

            model = self.train(X_train, y_train, X_val, y_val)
            metrics = self.evaluate(model, X_test, y_test)
            artifact_path = self.save(model, metrics)

            duration = time.time() - start
            return {
                "model": self.model_name,
                "status": "success",
                "metrics": metrics,
                "duration_seconds": round(duration, 2),
                "artifact_path": artifact_path,
            }
        except Exception as e:
            # Top-level boundary: report the failure instead of propagating,
            # but keep the traceback in the log.
            logger.exception("Training failed for %s: %s", self.model_name, e)
            return {
                "model": self.model_name,
                "status": "failed",
                "error": str(e),
                "duration_seconds": round(time.time() - start, 2),
                "artifact_path": "",
            }
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"""Event-aware catalyst trainer for 1d+ direction forecasts."""
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import joblib
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
from sklearn.metrics import accuracy_score, f1_score
|
|
10
|
+
|
|
11
|
+
from .data_loader import build_feature_frame, get_training_symbols
|
|
12
|
+
|
|
13
|
+
HIGH_IMPACT_EVENTS = [
|
|
14
|
+
("2025-12-17", "fomc"),
|
|
15
|
+
("2026-01-14", "cpi"),
|
|
16
|
+
("2026-01-28", "fomc"),
|
|
17
|
+
("2026-02-06", "nfp"),
|
|
18
|
+
("2026-02-11", "cpi"),
|
|
19
|
+
("2026-03-06", "nfp"),
|
|
20
|
+
("2026-03-18", "fomc"),
|
|
21
|
+
("2026-04-03", "nfp"),
|
|
22
|
+
("2026-05-06", "fomc"),
|
|
23
|
+
("2026-06-05", "nfp"),
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _event_features(timestamp) -> dict[str, float]:
|
|
28
|
+
event_times = [
|
|
29
|
+
(abs((timestamp - pd.Timestamp(date, tz="UTC")) / np.timedelta64(1, "D")), kind)
|
|
30
|
+
for date, kind in HIGH_IMPACT_EVENTS
|
|
31
|
+
]
|
|
32
|
+
if not event_times:
|
|
33
|
+
return {
|
|
34
|
+
"days_to_event": 30.0,
|
|
35
|
+
"event_risk": 0.0,
|
|
36
|
+
"within_24h": 0.0,
|
|
37
|
+
"within_72h": 0.0,
|
|
38
|
+
"within_7d": 0.0,
|
|
39
|
+
"is_fomc": 0.0,
|
|
40
|
+
"is_cpi": 0.0,
|
|
41
|
+
"is_nfp": 0.0,
|
|
42
|
+
}
|
|
43
|
+
days_to_event, kind = sorted(event_times, key=lambda item: item[0])[0]
|
|
44
|
+
return {
|
|
45
|
+
"days_to_event": float(days_to_event),
|
|
46
|
+
"event_risk": float(max(0.0, 1.0 - min(days_to_event, 14.0) / 14.0)),
|
|
47
|
+
"within_24h": float(days_to_event <= 1.0),
|
|
48
|
+
"within_72h": float(days_to_event <= 3.0),
|
|
49
|
+
"within_7d": float(days_to_event <= 7.0),
|
|
50
|
+
"is_fomc": float(kind == "fomc"),
|
|
51
|
+
"is_cpi": float(kind == "cpi"),
|
|
52
|
+
"is_nfp": float(kind == "nfp"),
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class CatalystTrainer:
    """Train a three-class direction model from event proximity and market features.

    Labels come from ``future_return_1d``: 2 when it is above 1.0, 0 when it
    is below -1.0 and 1 otherwise. The fitted model and its metadata are
    written to ``artifact_path`` with joblib.
    """

    model_name = "catalyst_event"
    artifact_path = Path("models") / "catalyst_event.joblib"
    # Column order of the feature matrix; stored with the artifact.
    feature_keys = [
        "days_to_event",
        "event_risk",
        "within_24h",
        "within_72h",
        "within_7d",
        "is_fomc",
        "is_cpi",
        "is_nfp",
        "returns_1d",
        "returns_7d",
        "volatility_14d",
        "fear_greed",
        "funding_rate",
    ]

    def _symbol_rows(self, symbol, days: int):
        """Build the labelled feature rows for one symbol, or None to skip it."""
        frame = build_feature_frame(symbol, timeframe="4h", days=days)
        if frame.empty or len(frame) < 120:
            return None

        # Rows without a known future return cannot be labelled. Comparing NaN
        # against the thresholds would silently label them as class 1 (flat).
        frame = frame[frame["future_return_1d"].notna()]
        if frame.empty:
            return None

        future = frame["future_return_1d"]
        labels = np.where(future > 1.0, 2, np.where(future < -1.0, 0, 1))

        event_frame = pd.DataFrame([_event_features(ts) for ts in frame["time"]])
        market_columns = ["returns_1d", "returns_7d", "volatility_14d", "fear_greed", "funding_rate"]
        # Both parts are re-indexed from zero so they line up row by row.
        merged = pd.concat(
            [
                event_frame.reset_index(drop=True),
                frame[market_columns].reset_index(drop=True),
            ],
            axis=1,
        )
        merged["y"] = labels
        return merged

    @staticmethod
    def _fit_model(X_train, y_train):
        """Fit XGBoost, falling back to a random forest; return (model, engine)."""
        try:
            import xgboost as xgb

            model = xgb.XGBClassifier(
                n_estimators=220,
                max_depth=4,
                learning_rate=0.05,
                subsample=0.85,
                colsample_bytree=0.85,
                objective="multi:softprob",
                num_class=3,
                eval_metric="mlogloss",
                random_state=42,
            )
            model.fit(X_train, y_train)
            return model, "xgboost"
        except Exception:
            # Best-effort fallback is kept, but the reason is recorded so an
            # unexpected engine switch is visible in the logs.
            import logging

            logging.getLogger(__name__).warning(
                "XGBoost training unavailable or failed; falling back to random forest",
                exc_info=True,
            )

        from sklearn.ensemble import RandomForestClassifier

        model = RandomForestClassifier(
            n_estimators=250,
            max_depth=10,
            class_weight="balanced_subsample",
            random_state=42,
            n_jobs=-1,
        )
        model.fit(X_train, y_train)
        return model, "random_forest"

    def run(self, days: int = 365) -> dict:
        """Collect data for every training symbol, fit, evaluate and save the model.

        Args:
            days: History length passed to ``build_feature_frame``.

        Returns:
            A status dict with ``model``, ``status``, ``metrics``,
            ``duration_seconds`` and ``artifact_path``. The status is
            ``"skipped"`` (with an ``error`` message) when no symbol yields
            usable rows.
        """
        start = time.time()
        rows = []
        for symbol in get_training_symbols():
            symbol_rows = self._symbol_rows(symbol, days)
            if symbol_rows is not None:
                rows.append(symbol_rows)

        if not rows:
            return {
                "model": self.model_name,
                "status": "skipped",
                "metrics": {"samples": 0},
                "duration_seconds": round(time.time() - start, 2),
                "artifact_path": "",
                "error": "No catalyst training data available",
            }

        frame = pd.concat(rows, ignore_index=True).fillna(0.0)
        X = frame[self.feature_keys].astype(np.float32).values
        y = frame["y"].astype(np.int64).values

        # NOTE(review): rows are concatenated symbol by symbol, so this
        # positional 80/20 split holds out the last symbols rather than the
        # most recent period. Confirm that this is the intended evaluation.
        split = int(len(X) * 0.80)
        X_train, X_test = X[:split], X[split:]
        y_train, y_test = y[:split], y[split:]

        model, engine = self._fit_model(X_train, y_train)

        preds = model.predict(X_test)
        accuracy = float(accuracy_score(y_test, preds))
        f1 = float(f1_score(y_test, preds, average="weighted", zero_division=0))

        self.artifact_path.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(
            {
                "model": model,
                "engine": engine,
                "feature_keys": self.feature_keys,
                "trained_at": str(int(time.time())),
                "accuracy": accuracy,
            },
            self.artifact_path,
        )

        return {
            "model": self.model_name,
            "status": "success",
            "metrics": {
                "accuracy": accuracy,
                "f1": f1,
                "test_samples": len(X_test),
            },
            "duration_seconds": round(time.time() - start, 2),
            "artifact_path": str(self.artifact_path),
        }
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""Training script for the Random Forest signal classifier."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import joblib
|
|
8
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
9
|
+
from sklearn.model_selection import train_test_split
|
|
10
|
+
from sklearn.metrics import classification_report
|
|
11
|
+
|
|
12
|
+
from .data_loader import get_training_symbols, load_ohlcv
|
|
13
|
+
|
|
14
|
+
# Directory the trained classifier is written to; read from the MODEL_DIR
# environment variable, defaulting to "models".
MODEL_DIR = Path(os.getenv("MODEL_DIR", "models"))
|
|
15
|
+
|
|
16
|
+
# Simple TA feature computation from raw OHLCV

# Number of columns in every feature row built by compute_features().
_N_FEATURES = 14


def _simple_rsi(prices) -> float:
    """Return a simple-average RSI over the last 14 price changes of *prices*.

    The small epsilon keeps the ratio finite when the window has no losing
    candle (flat or strictly rising prices).
    """
    deltas = np.diff(prices[-15:])
    avg_gain = np.mean(np.maximum(deltas, 0))
    avg_loss = np.mean(np.abs(np.minimum(deltas, 0)))
    rs = avg_gain / (avg_loss + 1e-10)
    return 100 - (100 / (1 + rs))


def compute_features(df):
    """Compute TA features from OHLCV DataFrame for classifier training.

    One sample is produced for every candle ``i`` that has 50 candles of
    history and 4 candles of future. Column order of a feature row:
    rsi, macd_hist, bb_pctb, ema12, ema26, atr, OBV placeholder (0),
    funding-rate placeholder (0), fear/greed placeholder (50),
    24h-price-change placeholder (0), rsi_slope, vol_ratio, ema_cross_pct,
    atr_pct.

    Args:
        df: Object exposing ``close``, ``high``, ``low`` and ``volume``
            columns of equal length.

    Returns:
        ``(X, y)`` where ``X`` is a float32 array of shape ``(n, 14)`` and
        ``y`` holds the labels ``"buy"`` / ``"sell"`` / ``"hold"``, derived
        from the 4-candle forward return with thresholds of +/-1.0 percent.
        With too little data both arrays are empty and ``X`` keeps its
        ``(0, 14)`` shape.
    """
    closes = np.asarray(df["close"], dtype=np.float64)
    highs = np.asarray(df["high"], dtype=np.float64)
    lows = np.asarray(df["low"], dtype=np.float64)
    volumes = np.asarray(df["volume"], dtype=np.float64)
    n = len(closes)

    # True range of every candle against the previous close, computed once.
    # Index 0 has no previous close and is never read (the loop starts at 50).
    true_range = np.zeros(n)
    if n > 1:
        prev_close = closes[:-1]
        true_range[1:] = np.maximum(
            np.maximum(highs[1:] - lows[1:], np.abs(highs[1:] - prev_close)),
            np.abs(lows[1:] - prev_close),
        )

    features, labels = [], []

    for i in range(50, n - 4):
        # History window excludes the current candle i.
        window = closes[i - 50 : i]
        price = closes[i]

        # RSI(14)
        rsi = _simple_rsi(window)

        # EMA12, EMA26 (simple means over the last 12 / 26 closes)
        ema12 = np.mean(window[-12:])
        ema26 = np.mean(window[-26:])

        # MACD histogram proxy
        macd_hist = ema12 - ema26

        # ATR(14)
        atr = np.mean(true_range[i - 14 : i])

        # Volume ratio
        vol_avg = np.mean(volumes[i - 20 : i])
        vol_ratio = volumes[i] / vol_avg if vol_avg > 0 else 1

        # Bollinger %B
        sma20 = np.mean(window[-20:])
        std20 = np.std(window[-20:])
        upper = sma20 + 2 * std20
        lower = sma20 - 2 * std20
        bb_pctb = (price - lower) / (upper - lower) if (upper - lower) > 0 else 0.5

        # Price as base
        ema_cross_pct = ((ema12 - ema26) / price * 100) if price > 0 else 0
        atr_pct = (atr / price * 100) if price > 0 else 0

        # RSI slope (3 periods): needs 3 extra candles of history.
        if i >= 53:
            rsi_slope = rsi - _simple_rsi(closes[i - 53 : i - 3])
        else:
            rsi_slope = 0

        features.append([
            rsi, macd_hist, bb_pctb, ema12, ema26,
            atr, 0,  # OBV placeholder
            0,  # funding rate placeholder
            50,  # fear/greed placeholder
            0,  # price change 24h placeholder
            rsi_slope, vol_ratio, ema_cross_pct, atr_pct,
        ])

        # Label: 4-candle forward return (guarded like the other price ratios)
        pct = (closes[i + 4] - price) / price * 100 if price > 0 else 0.0
        if pct > 1.0:
            labels.append("buy")
        elif pct < -1.0:
            labels.append("sell")
        else:
            labels.append("hold")

    # reshape keeps the column count even when no sample was produced, so the
    # result can always be concatenated with other symbols' features.
    X = np.array(features, dtype=np.float32).reshape(-1, _N_FEATURES)
    return X, np.array(labels, dtype=str)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def train(days: int = 90):
    """Fit the Random Forest signal classifier on historical 4h candles.

    For every training symbol the OHLCV history is loaded and turned into
    features and labels; symbols with fewer than 100 rows or with a loading
    error are reported and skipped. The pooled samples are split 80/20, the
    classifier is fitted, a classification report and the test accuracy are
    printed, and the model is stored as ``signal_classifier.joblib`` inside
    ``MODEL_DIR``. Nothing is trained when no symbol yields data.
    """
    print(f"Loading data ({days} days)...")

    feature_chunks = []
    label_chunks = []
    for symbol in get_training_symbols():
        try:
            candles = load_ohlcv(symbol, "4h", days)
            if len(candles) >= 100:
                symbol_X, symbol_y = compute_features(candles)
                feature_chunks.append(symbol_X)
                label_chunks.append(symbol_y)
                print(f" {symbol}: {len(symbol_X)} samples")
            else:
                print(f" {symbol}: insufficient data ({len(candles)} rows), skipping")
        except Exception as e:
            # One failing symbol must not abort the whole run.
            print(f" {symbol}: error — {e}")

    if len(feature_chunks) == 0:
        print("No training data available.")
        return

    X = np.concatenate(feature_chunks)
    y = np.concatenate(label_chunks)
    unique, counts = np.unique(y, return_counts=True)
    print(f"Total: {len(X)} samples, classes: {dict(zip(unique, counts))}")

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
    )

    forest_params = {
        "n_estimators": 200,
        "max_depth": 12,
        "min_samples_split": 10,
        "class_weight": "balanced",
        "random_state": 42,
        "n_jobs": -1,
    }
    clf = RandomForestClassifier(**forest_params)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    accuracy = np.mean(y_pred == y_test)
    print(f"Test accuracy: {accuracy:.3f}")

    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    model_path = MODEL_DIR / "signal_classifier.joblib"
    joblib.dump(clf, model_path)
    print(f"Model saved to {model_path}")


if __name__ == "__main__":
    train()
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Conformal interval trainer for target-delta forecasts."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import time
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import joblib
|
|
8
|
+
import numpy as np
|
|
9
|
+
from sklearn.ensemble import GradientBoostingRegressor
|
|
10
|
+
|
|
11
|
+
from .data_loader import load_target_outcomes
|
|
12
|
+
from .train_direction import ALL_FEATURE_KEYS
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _horizon_to_minutes(horizon: str) -> float:
|
|
16
|
+
total = 0.0
|
|
17
|
+
for amount, unit in re.findall(r"(\d+)(mo|y|w|d|h|m)", str(horizon or "").strip().lower()):
|
|
18
|
+
quantity = float(amount or 0)
|
|
19
|
+
if unit == "m":
|
|
20
|
+
total += quantity
|
|
21
|
+
elif unit == "h":
|
|
22
|
+
total += quantity * 60
|
|
23
|
+
elif unit == "d":
|
|
24
|
+
total += quantity * 1440
|
|
25
|
+
elif unit == "w":
|
|
26
|
+
total += quantity * 10080
|
|
27
|
+
elif unit == "mo":
|
|
28
|
+
total += quantity * 43200
|
|
29
|
+
elif unit == "y":
|
|
30
|
+
total += quantity * 525600
|
|
31
|
+
return total if total > 0 else 240.0
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ConformalIntervalTrainer:
    """Calibrate residual offsets for target-delta forecasts.

    A gradient-boosting regressor is fitted on the first 70% of the outcomes,
    its residuals on the next 15% give the 10% and 90% quantile offsets, and
    the last 15% are used to measure how often the actual change falls inside
    the resulting band. Only the offsets, the coverage and a timestamp are
    written to ``artifact_path``.
    """

    model_name = "interval_conformal_calibrator"
    artifact_path = Path("models") / "interval_conformal_calibrator.joblib"

    def run(self, days: int = 180) -> dict:
        """Run the calibration and return a status dictionary.

        Args:
            days: Look-back passed to ``load_target_outcomes``.

        Returns:
            A dict with ``model``, ``status``, ``metrics``,
            ``duration_seconds`` and ``artifact_path``. With fewer than 60
            outcomes the status is ``"skipped"`` and an ``error`` message is
            included.
        """
        started = time.time()
        outcomes = load_target_outcomes(days)
        sample_count = len(outcomes)

        if sample_count < 60:
            return {
                "model": self.model_name,
                "status": "skipped",
                "metrics": {"samples": sample_count},
                "duration_seconds": round(time.time() - started, 2),
                "artifact_path": "",
                "error": f"Need at least 60 samples, got {sample_count}",
            }

        def enrich(outcome):
            # Copy the stored features and attach the two derived inputs.
            record = dict(outcome["features"])
            record["probability_hint"] = float(outcome.get("probability", 0.5))
            record["horizon_minutes"] = _horizon_to_minutes(str(outcome.get("horizon", "4h")))
            return record

        records = [enrich(outcome) for outcome in outcomes]

        feature_keys = ALL_FEATURE_KEYS + ["probability_hint", "horizon_minutes"]
        matrix = [[record.get(key, 0.0) for key in feature_keys] for record in records]
        X = np.array(matrix, dtype=np.float32)
        targets = [float(outcome["changePct"]) for outcome in outcomes]
        y = np.array(targets, dtype=np.float32)

        # Positional 70 / 15 / 15 split into train, calibration and test parts.
        total = len(X)
        train_end = int(total * 0.70)
        cal_end = int(total * 0.85)
        X_train, y_train = X[:train_end], y[:train_end]
        X_cal, y_cal = X[train_end:cal_end], y[train_end:cal_end]
        X_test, y_test = X[cal_end:], y[cal_end:]

        base = GradientBoostingRegressor(
            loss="absolute_error",
            n_estimators=250,
            max_depth=3,
            random_state=42,
        )
        base.fit(X_train, y_train)

        # Offsets are the 10% and 90% quantiles of the calibration residuals.
        residuals = y_cal - base.predict(X_cal)
        lower = float(np.quantile(residuals, 0.10))
        upper = float(np.quantile(residuals, 0.90))

        # Share of test targets inside [prediction + lower, prediction + upper].
        test_preds = base.predict(X_test)
        inside = (y_test >= test_preds + lower) & (y_test <= test_preds + upper)
        coverage = float(np.mean(inside))

        interval = {
            "lower_residual_pct": lower,
            "upper_residual_pct": upper,
            "coverage": coverage,
        }

        self.artifact_path.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump({**interval, "trained_at": str(int(time.time()))}, self.artifact_path)

        return {
            "model": self.model_name,
            "status": "success",
            "metrics": {
                **interval,
                "train_samples": len(X_train),
                "calibration_samples": len(X_cal),
                "test_samples": len(X_test),
            },
            "duration_seconds": round(time.time() - started, 2),
            "artifact_path": str(self.artifact_path),
        }
|