@vizzor/cli 0.13.1 → 0.14.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +251 -192
- package/chronovisor-engine/pyproject.toml +31 -0
- package/chronovisor-engine/src/__init__.py +0 -0
- package/chronovisor-engine/src/inference/__init__.py +0 -0
- package/chronovisor-engine/src/inference/predict.py +44 -0
- package/chronovisor-engine/src/model_catalog.py +219 -0
- package/chronovisor-engine/src/models/__init__.py +0 -0
- package/chronovisor-engine/src/models/anomaly_detector.py +104 -0
- package/chronovisor-engine/src/models/blockchain_cycle_analyzer.py +217 -0
- package/chronovisor-engine/src/models/catalyst_event_model.py +70 -0
- package/chronovisor-engine/src/models/conformal_interval.py +50 -0
- package/chronovisor-engine/src/models/divergence_detector.py +247 -0
- package/chronovisor-engine/src/models/drift_monitor.py +51 -0
- package/chronovisor-engine/src/models/intent_classifier.py +189 -0
- package/chronovisor-engine/src/models/lstm_predictor.py +143 -0
- package/chronovisor-engine/src/models/microstructure_specialist.py +65 -0
- package/chronovisor-engine/src/models/narrative_detector.py +418 -0
- package/chronovisor-engine/src/models/portfolio_optimizer.py +162 -0
- package/chronovisor-engine/src/models/project_risk_scorer.py +184 -0
- package/chronovisor-engine/src/models/pump_detector.py +344 -0
- package/chronovisor-engine/src/models/regime_detector.py +127 -0
- package/chronovisor-engine/src/models/rug_detector.py +197 -0
- package/chronovisor-engine/src/models/sentiment_analyzer.py +257 -0
- package/chronovisor-engine/src/models/signal_classifier.py +191 -0
- package/chronovisor-engine/src/models/stacking_meta.py +56 -0
- package/chronovisor-engine/src/models/strategy_bandit.py +191 -0
- package/chronovisor-engine/src/models/ta_interpreter.py +341 -0
- package/chronovisor-engine/src/models/target_quantile.py +96 -0
- package/chronovisor-engine/src/models/trend_scorer.py +107 -0
- package/chronovisor-engine/src/models/wallet_classifier.py +261 -0
- package/chronovisor-engine/src/server.py +1686 -0
- package/chronovisor-engine/src/training/__init__.py +0 -0
- package/chronovisor-engine/src/training/data_loader.py +635 -0
- package/chronovisor-engine/src/training/pipeline.py +130 -0
- package/chronovisor-engine/src/training/train_catalyst.py +169 -0
- package/chronovisor-engine/src/training/train_classifier.py +159 -0
- package/chronovisor-engine/src/training/train_conformal.py +106 -0
- package/chronovisor-engine/src/training/train_direction.py +215 -0
- package/chronovisor-engine/src/training/train_drift.py +57 -0
- package/chronovisor-engine/src/training/train_isotonic.py +58 -0
- package/chronovisor-engine/src/training/train_lstm.py +217 -0
- package/chronovisor-engine/src/training/train_microstructure.py +102 -0
- package/chronovisor-engine/src/training/train_narrative.py +168 -0
- package/chronovisor-engine/src/training/train_pump.py +109 -0
- package/chronovisor-engine/src/training/train_regime.py +116 -0
- package/chronovisor-engine/src/training/train_rug.py +58 -0
- package/chronovisor-engine/src/training/train_sentiment.py +63 -0
- package/chronovisor-engine/src/training/train_stacking_meta.py +74 -0
- package/chronovisor-engine/src/training/train_target_quantile.py +115 -0
- package/chronovisor-engine/src/training/train_trend.py +101 -0
- package/dist/index.js +22494 -15023
- package/dist/index.js.map +1 -1
- package/package.json +5 -1
- package/vizzor_logodarkicon.png +0 -0
- package/vizzor_logoicon.png +0 -0
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""Training pipeline for narrative detection model."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
from .pipeline import TrainingPipeline
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
# Sample narrative corpus for pipeline testing.
# Maps narrative label -> example headlines; insertion order defines the
# integer class ids used by build_narrative_corpus().
SAMPLE_NARRATIVES = {
    "ai_crypto": [
        "AI agents are revolutionizing crypto trading with new inference tokens",
        "Decentralized AI compute networks see massive growth in TVL",
        "New LLM-powered trading bots leverage on-chain data for alpha",
    ],
    "rwa": [
        "Tokenized treasury bills gain institutional adoption as yields rise",
        "Real world asset protocols see record inflows from TradFi",
        "Ondo Finance launches new tokenized bond product on Ethereum",
    ],
    "depin": [
        "DePIN networks expand hardware infrastructure with new sensor deployments",
        "Helium and Hivemapper lead physical infrastructure narrative growth",
        "Render Network compute demand surges amid AI training requirements",
    ],
    "meme": [
        "Memecoin season returns as PEPE and WIF hit new highs on Solana",
        "Pump.fun launches see degen traders ape into new fair launch tokens",
        "Community-driven meme tokens dominate social media sentiment",
    ],
    "defi_revival": [
        "DeFi TVL crosses $100B as Aave and Uniswap report record volumes",
        "Lending protocols see yield compression as liquidity floods back",
        "Perpetual DEX volumes surpass centralized exchange competitors",
    ],
}


def build_narrative_corpus(seed: int = 42, max_features: int = 200) -> dict:
    """Build a bag-of-words training corpus from ``SAMPLE_NARRATIVES``.

    Each sample headline is emitted once verbatim plus three word-order
    shuffled variants, then every text is converted to a vector of raw token
    counts (lower-cased, whitespace-split). No TF-IDF weighting is applied.

    Args:
        seed: Seed for the local random generator used to shuffle words. A
            private generator is used so the global NumPy RNG is untouched.
        max_features: Upper bound on vocabulary size. When the corpus has
            more distinct tokens, the most frequent ones are kept (ties
            broken alphabetically); columns are always in alphabetical order.

    Returns:
        A dict with ``"X"`` (float32 count matrix, one row per text),
        ``"y"`` (int32 class ids, grouped by narrative in insertion order)
        and ``"label_map"`` (list of narrative names indexed by class id).
    """
    from collections import Counter

    logger.info("Building narrative detection corpus...")

    rng = np.random.default_rng(seed)
    label_map = list(SAMPLE_NARRATIVES)

    texts: list[str] = []
    labels: list[int] = []
    for label_idx, samples in enumerate(SAMPLE_NARRATIVES.values()):
        for sample in samples:
            texts.append(sample)
            labels.append(label_idx)
            # Simple augmentation: shuffle words.
            # NOTE(review): bag-of-words counts ignore word order, so these
            # variants yield feature rows identical to the original sample.
            words = sample.split()
            for _ in range(3):
                rng.shuffle(words)
                texts.append(" ".join(words))
                labels.append(label_idx)

    # Tokenize once; reused for both vocabulary building and counting.
    tokenized = [text.lower().split() for text in texts]
    frequencies = Counter(token for tokens in tokenized for token in tokens)

    # Keep the most frequent tokens when the vocabulary must be truncated,
    # instead of whichever happen to sort first alphabetically.
    limit = max(max_features, 0)
    kept = sorted(frequencies, key=lambda w: (-frequencies[w], w))[:limit]
    vocab = sorted(kept)
    word_to_idx = {word: col for col, word in enumerate(vocab)}

    X = np.zeros((len(texts), len(vocab)), dtype=np.float32)
    for row, tokens in enumerate(tokenized):
        for token in tokens:
            col = word_to_idx.get(token)
            if col is not None:
                X[row, col] += 1.0

    y = np.array(labels, dtype=np.int32)
    return {"X": X, "y": y, "label_map": label_map}
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class NarrativeTrainer(TrainingPipeline):
    """Training pipeline for narrative detection.

    Fits a random-forest classifier on the bag-of-words corpus returned by
    ``build_narrative_corpus`` and persists it with joblib.
    """

    def __init__(self) -> None:
        super().__init__("narrative_detector")

    def load_data(self) -> dict:
        """Return the corpus dict (``X``, ``y``, ``label_map``)."""
        logger.info("Loading narrative detection training data...")
        return build_narrative_corpus()

    def preprocess(self, data: dict) -> tuple:
        """Shuffle the corpus and split it 70/15/15 into train/val/test.

        The corpus is emitted grouped by label, so slicing it in order would
        leave the last narrative entirely out of the training split and make
        the test split single-class. A fixed-seed permutation is applied
        first so every split sees a mix of classes and runs stay repeatable.

        Returns:
            ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
        """
        X = np.asarray(data["X"])
        y = np.asarray(data["y"])
        n = len(X)

        order = np.random.default_rng(42).permutation(n)
        X, y = X[order], y[order]

        train_end = int(n * 0.7)
        val_end = int(n * 0.85)
        return (
            X[:train_end],
            X[train_end:val_end],
            X[val_end:],
            y[:train_end],
            y[train_end:val_end],
            y[val_end:],
        )

    def train(
        self,
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_val: np.ndarray,
        y_val: np.ndarray,
    ):
        """Fit a RandomForestClassifier and log its validation accuracy."""
        from sklearn.ensemble import RandomForestClassifier

        model = RandomForestClassifier(
            n_estimators=100, max_depth=10, random_state=42
        )
        model.fit(X_train, y_train)
        val_acc = model.score(X_val, y_val)
        logger.info(f"Narrative detector validation accuracy: {val_acc:.4f}")
        return model

    def evaluate(self, model, X_test: np.ndarray, y_test: np.ndarray) -> dict:
        """Return accuracy and weighted precision/recall/F1 on the test split."""
        from sklearn.metrics import (
            accuracy_score,
            f1_score,
            precision_score,
            recall_score,
        )

        preds = model.predict(X_test)
        return {
            "accuracy": float(accuracy_score(y_test, preds)),
            "precision": float(
                precision_score(y_test, preds, average="weighted", zero_division=0)
            ),
            "recall": float(
                recall_score(y_test, preds, average="weighted", zero_division=0)
            ),
            "f1": float(
                f1_score(y_test, preds, average="weighted", zero_division=0)
            ),
            "test_samples": len(y_test),
        }

    def save(self, model, metrics: dict) -> str:
        """Run the base-class save, then dump the model next to its output.

        Returns:
            Path of the joblib artifact written by this method.
        """
        import joblib

        path = super().save(model, metrics)
        # NOTE(review): assumes the base class returns a string path ending in
        # ".pkl"; if it does not, artifact_path equals path and the dump below
        # overwrites whatever the base class wrote there. Confirm against
        # TrainingPipeline.save.
        artifact_path = path.replace(".pkl", "_model.pkl")
        joblib.dump(model, artifact_path)
        return artifact_path
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def train_narrative_detector(
    data_dir: str = "data/narrative", output_dir: str = "models/"
) -> dict:
    """Run the narrative detector training pipeline and return its result.

    ``data_dir`` and ``output_dir`` are accepted but not currently used by
    this function; the trainer is constructed without arguments.
    """
    return NarrativeTrainer().run()


# Allow running this module directly as a training script.
if __name__ == "__main__":
    train_narrative_detector()
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Training pipeline for pump detection using real microstructure history."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import time
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
from .data_loader import load_pump_training_frame
|
|
9
|
+
from .pipeline import TrainingPipeline
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class PumpTrainer(TrainingPipeline):
    """Train a classifier over short-horizon pump/dump microstructure features."""

    # Columns read from the training frame, in model input order.
    FEATURE_KEYS = ["return_1", "volume_ratio", "cusum_up", "cusum_down", "volatility_5"]

    def __init__(self) -> None:
        super().__init__("pump_detector")

    def load_data(self):
        """Load the pump training frame (30 days of 1m data).

        Raises:
            RuntimeError: If the loader returns an empty frame.
        """
        logger.info("Loading real pump/dump training data from 1m OHLCV history...")
        frame = load_pump_training_frame(days=30, timeframe="1m")
        if frame.empty:
            raise RuntimeError("No historical pump training data available")
        return frame

    def preprocess(self, data):
        """Extract features/labels and split 70/15/15 in row order.

        Missing feature values are replaced with 0.0.
        NOTE(review): rows are split in the order the loader returns them;
        presumably that is chronological - confirm in data_loader.

        Returns:
            ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
        """
        X = data[self.FEATURE_KEYS].fillna(0.0).astype(np.float32).values
        y = data["y"].astype(np.int64).values
        n = len(X)
        train_end = int(n * 0.70)
        val_end = int(n * 0.85)
        return (
            X[:train_end],
            X[train_end:val_end],
            X[val_end:],
            y[:train_end],
            y[train_end:val_end],
            y[val_end:],
        )

    def train(self, X_train, y_train, X_val, y_val):
        """Fit XGBoost, falling back to scikit-learn gradient boosting.

        The fallback is best-effort by design (XGBoost may be missing or may
        reject the data), but the reason is logged so a silently degraded
        model does not go unnoticed.

        Returns:
            Dict with the fitted ``"model"`` and the ``"engine"`` name used.
        """
        try:
            import xgboost as xgb

            model = xgb.XGBClassifier(
                n_estimators=250,
                max_depth=4,
                learning_rate=0.05,
                subsample=0.85,
                colsample_bytree=0.85,
                objective="multi:softprob",
                num_class=3,
                eval_metric="mlogloss",
                early_stopping_rounds=20,
                random_state=42,
            )
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
            return {"model": model, "engine": "xgboost"}
        except Exception:
            logger.warning(
                "XGBoost unavailable or failed for pump detector; "
                "falling back to GradientBoostingClassifier",
                exc_info=True,
            )
            from sklearn.ensemble import GradientBoostingClassifier

            model = GradientBoostingClassifier(n_estimators=180, max_depth=4, random_state=42)
            model.fit(X_train, y_train)
            return {"model": model, "engine": "gradient_boosting"}

    def evaluate(self, trained, X_test, y_test):
        """Return accuracy and weighted precision/recall/F1 on the test split."""
        from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

        model = trained["model"]
        preds = model.predict(X_test)
        return {
            "accuracy": float(accuracy_score(y_test, preds)),
            "precision": float(
                precision_score(y_test, preds, average="weighted", zero_division=0)
            ),
            "recall": float(recall_score(y_test, preds, average="weighted", zero_division=0)),
            "f1": float(f1_score(y_test, preds, average="weighted", zero_division=0)),
            "engine": trained["engine"],
            "test_samples": len(y_test),
        }

    def save(self, trained, metrics):
        """Run the base-class save, then write the serving bundle.

        The bundle holds the model, engine name, training timestamp (epoch
        seconds as a string), test accuracy and the feature column order.

        Returns:
            Path of the joblib bundle as a string.
        """
        import joblib

        super().save(trained, metrics)
        artifact_path = self.artifact_dir.parent / "pump_detector.joblib"
        joblib.dump(
            {
                "model": trained["model"],
                "engine": trained["engine"],
                "trained_at": str(int(time.time())),
                "accuracy": metrics.get("accuracy"),
                "feature_keys": self.FEATURE_KEYS,
            },
            artifact_path,
        )
        return str(artifact_path)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def train_pump_detector() -> dict:
    """Run the pump detector training pipeline and return its result."""
    return PumpTrainer().run()


# Allow running this module directly as a training script.
if __name__ == "__main__":
    train_pump_detector()
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Regime detector training on real historical market-state features."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import time
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
from .data_loader import load_regime_training_frame
|
|
9
|
+
from .pipeline import TrainingPipeline
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
REGIMES = ["trending_bull", "trending_bear", "ranging", "volatile", "capitulation"]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class RegimeTrainer(TrainingPipeline):
    """Train a multi-class market regime classifier over ``REGIMES``."""

    # Columns read from the training frame, in model input order.
    FEATURE_KEYS = [
        "returns_1d",
        "returns_7d",
        "volatility_14d",
        "volume_ratio",
        "rsi",
        "bb_width",
        "fear_greed",
        "funding_rate",
        "price_vs_sma200",
    ]

    def __init__(self):
        super().__init__("regime_detector")

    def load_data(self):
        """Load the regime training frame (300 days of 4h data).

        Raises:
            RuntimeError: If the loader returns an empty frame.
        """
        logger.info("Loading real regime training data from OHLCV history...")
        frame = load_regime_training_frame(days=300, timeframe="4h")
        if frame.empty:
            raise RuntimeError("No historical regime training data available")
        return frame

    def preprocess(self, data):
        """Sort by index, encode labels and split 70/15/15 in row order.

        Missing feature values are replaced with 0.0. Labels are encoded as
        their position in ``REGIMES``.

        Returns:
            ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

        Raises:
            ValueError: If the frame contains a label not listed in
                ``REGIMES``.
        """
        data = data.sort_index().reset_index(drop=True)
        X = data[self.FEATURE_KEYS].fillna(0.0).astype(np.float32).values

        codes = data["y"].map({name: idx for idx, name in enumerate(REGIMES)})
        unmapped = codes.isna()
        if unmapped.any():
            # Fail with the offending labels rather than an opaque NaN-to-int
            # cast error from astype below.
            unknown = sorted(set(data.loc[unmapped, "y"].astype(str)))
            raise ValueError(f"Unknown regime labels in training data: {unknown}")
        y = codes.astype(np.int64).values

        n = len(X)
        train_end = int(n * 0.70)
        val_end = int(n * 0.85)
        return X[:train_end], X[train_end:val_end], X[val_end:], y[:train_end], y[train_end:val_end], y[val_end:]

    def train(self, X_train, y_train, X_val, y_val):
        """Fit XGBoost, falling back to a scikit-learn random forest.

        The fallback is best-effort by design, but the reason is logged so a
        silently degraded model does not go unnoticed.

        Returns:
            Dict with the fitted ``"model"`` and the ``"engine"`` name used.
        """
        try:
            import xgboost as xgb

            model = xgb.XGBClassifier(
                n_estimators=250,
                max_depth=4,
                learning_rate=0.05,
                subsample=0.85,
                colsample_bytree=0.85,
                min_child_weight=3,
                objective="multi:softprob",
                num_class=len(REGIMES),
                eval_metric="mlogloss",
                early_stopping_rounds=25,
                random_state=42,
            )
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
            return {"model": model, "engine": "xgboost"}
        except Exception:
            logger.warning(
                "XGBoost unavailable or failed for regime detector; "
                "falling back to RandomForestClassifier",
                exc_info=True,
            )
            from sklearn.ensemble import RandomForestClassifier

            model = RandomForestClassifier(
                n_estimators=250,
                max_depth=10,
                min_samples_leaf=4,
                class_weight="balanced_subsample",
                random_state=42,
                n_jobs=-1,
            )
            model.fit(X_train, y_train)
            return {"model": model, "engine": "random_forest"}

    def evaluate(self, trained, X_test, y_test):
        """Return overall accuracy and per-regime report entries."""
        from sklearn.metrics import accuracy_score, classification_report

        model = trained["model"]
        preds = model.predict(X_test)
        report = classification_report(
            y_test,
            preds,
            # Pin the label set so target_names always lines up, even when a
            # regime is absent from the test split.
            labels=list(range(len(REGIMES))),
            target_names=REGIMES,
            output_dict=True,
            zero_division=0,
        )
        return {
            "accuracy": float(accuracy_score(y_test, preds)),
            "per_class": {k: v for k, v in report.items() if k in REGIMES},
            "engine": trained["engine"],
            "test_samples": len(y_test),
        }

    def save(self, trained, metrics):
        """Run the base-class save, then write the serving bundle.

        The bundle holds the model, engine name, training timestamp (epoch
        seconds as a string), test accuracy and the feature column order.

        Returns:
            Path of the joblib bundle as a string.
        """
        import joblib

        super().save(trained, metrics)
        artifact_path = self.artifact_dir.parent / "regime_detector.joblib"
        joblib.dump(
            {
                "model": trained["model"],
                "engine": trained["engine"],
                "trained_at": str(int(time.time())),
                "accuracy": metrics.get("accuracy"),
                "feature_keys": self.FEATURE_KEYS,
            },
            artifact_path,
        )
        return str(artifact_path)
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Rug detection model training script."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import numpy as np
|
|
5
|
+
from .pipeline import TrainingPipeline
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class RugTrainer(TrainingPipeline):
    """Training pipeline for the rug detector (currently on synthetic data)."""

    def __init__(self):
        super().__init__("rug_detector")

    def load_data(self):
        """Return placeholder training data.

        Generates 1000 rows of 15 standard-normal features; the label is 1
        when features 0, 2 and 4 sum to more than 1.5. A fixed seed keeps the
        placeholder reproducible, in line with the model's ``random_state``.

        Returns:
            Dict with ``"X"`` (float32) and ``"y"`` (int32).
        """
        # Load from feature store / CSV / database
        logger.info("Loading rug detection training data...")
        # Placeholder: generate synthetic data for pipeline testing
        logger.warning(
            "Rug detector is being trained on synthetic placeholder data; "
            "the resulting artifact is not suitable for real predictions"
        )
        n = 1000
        rng = np.random.default_rng(42)
        X = rng.standard_normal((n, 15)).astype(np.float32)
        y = (X[:, 0] + X[:, 2] + X[:, 4] > 1.5).astype(np.int32)
        return {"X": X, "y": y}

    def preprocess(self, data):
        """Split 70/15/15 in row order.

        Returns:
            ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
        """
        X, y = data["X"], data["y"]
        n = len(X)
        train_end = int(n * 0.7)
        val_end = int(n * 0.85)
        return (
            X[:train_end], X[train_end:val_end], X[val_end:],
            y[:train_end], y[train_end:val_end], y[val_end:],
        )

    def train(self, X_train, y_train, X_val, y_val):
        """Fit a GradientBoostingClassifier and log its validation accuracy."""
        from sklearn.ensemble import GradientBoostingClassifier

        model = GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42)
        model.fit(X_train, y_train)
        val_acc = model.score(X_val, y_val)
        logger.info(f"Validation accuracy: {val_acc:.4f}")
        return model

    def evaluate(self, model, X_test, y_test):
        """Return accuracy, precision, recall and F1 for the positive class."""
        from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

        preds = model.predict(X_test)
        return {
            "accuracy": float(accuracy_score(y_test, preds)),
            "precision": float(precision_score(y_test, preds, zero_division=0)),
            "recall": float(recall_score(y_test, preds, zero_division=0)),
            "f1": float(f1_score(y_test, preds, zero_division=0)),
            "test_samples": len(y_test),
        }

    def save(self, model, metrics):
        """Run the base-class save, then dump the bare model with joblib.

        Returns:
            Path of the joblib artifact as a string.
        """
        import joblib

        super().save(model, metrics)
        artifact_path = self.artifact_dir.parent / "rug_detector.joblib"
        joblib.dump(model, artifact_path)
        return str(artifact_path)
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Sentiment NLP model training script."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import time
|
|
5
|
+
import numpy as np
|
|
6
|
+
from .pipeline import TrainingPipeline
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SentimentTrainer(TrainingPipeline):
    """Training pipeline for the sentiment model (currently on synthetic data)."""

    def __init__(self):
        super().__init__("sentiment_nlp")

    def load_data(self):
        """Return placeholder training data.

        Generates 500 templated headlines with labels drawn uniformly from
        bullish/bearish/neutral. A fixed seed keeps the labels reproducible.

        Returns:
            Dict with ``"X"`` (list of str) and ``"y"`` (array of str labels).
        """
        logger.info("Loading sentiment training data...")
        logger.warning(
            "Sentiment model is being trained on synthetic placeholder data "
            "with random labels; the resulting artifact is not suitable for "
            "real predictions"
        )
        n = 500
        X = [f"Headline {i} about crypto" for i in range(n)]
        rng = np.random.default_rng(42)
        y = rng.choice(["bullish", "bearish", "neutral"], size=n)
        return {"X": X, "y": y}

    def preprocess(self, data):
        """Split 70/15/15 in row order.

        Returns:
            ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
        """
        X, y = data["X"], data["y"]
        n = len(X)
        t1, t2 = int(n * 0.7), int(n * 0.85)
        return X[:t1], X[t1:t2], X[t2:], y[:t1], y[t1:t2], y[t2:]

    def train(self, X_train, y_train, X_val, y_val):
        """Fit a TF-IDF + logistic regression pipeline and log validation accuracy."""
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.linear_model import LogisticRegression
        from sklearn.pipeline import Pipeline

        model = Pipeline([
            ("tfidf", TfidfVectorizer(max_features=5000)),
            ("clf", LogisticRegression(max_iter=1000)),
        ])
        model.fit(X_train, y_train)
        val_acc = model.score(X_val, y_val)
        logger.info(f"Validation accuracy: {val_acc:.4f}")
        return model

    def evaluate(self, model, X_test, y_test):
        """Return accuracy and sample count for the test split."""
        from sklearn.metrics import accuracy_score

        preds = model.predict(X_test)
        return {
            "accuracy": float(accuracy_score(y_test, preds)),
            "test_samples": len(y_test),
        }

    def save(self, model, metrics):
        """Run the base-class save, then write the serving bundle.

        The bundle holds the model, training timestamp (epoch seconds as a
        string) and test accuracy.

        Returns:
            Path of the joblib bundle as a string.
        """
        import joblib

        super().save(model, metrics)
        artifact_path = self.artifact_dir.parent / "sentiment_nlp.joblib"
        joblib.dump(
            {
                "model": model,
                "trained_at": str(int(time.time())),
                "accuracy": metrics.get("accuracy"),
            },
            artifact_path,
        )
        return str(artifact_path)
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""Stacking meta-confidence trainer from resolved prediction history."""
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import joblib
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from sklearn.linear_model import LogisticRegression
|
|
9
|
+
from sklearn.metrics import accuracy_score, roc_auc_score
|
|
10
|
+
|
|
11
|
+
from .data_loader import load_meta_prediction_frame
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class StackingMetaTrainer:
    """Fit a logistic-regression meta-model predicting whether a prediction was correct."""

    model_name = "meta_stacking"
    # Relative path: resolved against the process working directory.
    artifact_path = Path("models") / "meta_stacking.joblib"

    def _skipped(self, start: float, samples: int, reason: str) -> dict:
        """Build the result dict returned when training cannot proceed."""
        return {
            "model": self.model_name,
            "status": "skipped",
            "metrics": {"samples": samples},
            "duration_seconds": round(time.time() - start, 2),
            "artifact_path": "",
            "error": reason,
        }

    def run(self, days: int = 180) -> dict:
        """Train the meta-model on resolved predictions from the last ``days`` days.

        The frame is split 80/20 in row order (NOTE(review): presumably the
        loader returns rows chronologically - confirm in data_loader). The
        ``model`` and ``horizon`` columns are one-hot encoded; all remaining
        columns except ``was_correct`` and ``actual_change_pct`` are used as
        features with missing values filled with 0.0.

        Returns:
            A result dict with ``model``, ``status`` (``"success"`` or
            ``"skipped"``), ``metrics``, ``duration_seconds`` and
            ``artifact_path``; skipped results also carry ``error``.
        """
        start = time.time()
        frame = load_meta_prediction_frame(days)
        if frame.empty or len(frame) < 50:
            return self._skipped(start, len(frame), "Need at least 50 resolved predictions")

        encoded = frame.copy()
        y = encoded.pop("was_correct").astype(int)
        # The realised price change is the outcome itself, not a feature.
        encoded = encoded.drop(columns=["actual_change_pct"])
        encoded = encoded.fillna(0.0)
        encoded = encoded.astype({"model": "string", "horizon": "string"})
        encoded = pd.get_dummies(encoded, columns=["model", "horizon"], dtype=float)

        n = len(encoded)
        split = int(n * 0.80)
        X_train, X_test = encoded.iloc[:split], encoded.iloc[split:]
        y_train, y_test = y.iloc[:split], y.iloc[split:]

        # LogisticRegression cannot be fitted on a single class; report a
        # skipped run instead of letting fit() raise.
        if y_train.nunique() < 2:
            return self._skipped(
                start, n, "Training split contains a single outcome class"
            )

        model = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)
        model.fit(X_train, y_train)

        proba = model.predict_proba(X_test)[:, 1]
        pred = (proba >= 0.5).astype(int)
        accuracy = float(accuracy_score(y_test, pred))
        # AUC is undefined when the test split has one class; report chance.
        auc = float(roc_auc_score(y_test, proba)) if y_test.nunique() > 1 else 0.5

        self.artifact_path.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(
            {
                "model": model,
                # Column order is stored so the feature layout can be
                # reproduced when the bundle is loaded.
                "columns": list(X_train.columns),
                "trained_at": str(int(time.time())),
                "accuracy": accuracy,
                "auc": auc,
            },
            self.artifact_path,
        )

        return {
            "model": self.model_name,
            "status": "success",
            "metrics": {
                "accuracy": accuracy,
                "auc": auc,
                "train_samples": len(X_train),
                "test_samples": len(X_test),
            },
            "duration_seconds": round(time.time() - start, 2),
            "artifact_path": str(self.artifact_path),
        }
|