@vizzor/cli 0.13.0 → 0.14.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +250 -191
- package/chronovisor-engine/pyproject.toml +31 -0
- package/chronovisor-engine/src/__init__.py +0 -0
- package/chronovisor-engine/src/inference/__init__.py +0 -0
- package/chronovisor-engine/src/inference/predict.py +44 -0
- package/chronovisor-engine/src/model_catalog.py +219 -0
- package/chronovisor-engine/src/models/__init__.py +0 -0
- package/chronovisor-engine/src/models/anomaly_detector.py +104 -0
- package/chronovisor-engine/src/models/blockchain_cycle_analyzer.py +217 -0
- package/chronovisor-engine/src/models/catalyst_event_model.py +70 -0
- package/chronovisor-engine/src/models/conformal_interval.py +50 -0
- package/chronovisor-engine/src/models/divergence_detector.py +247 -0
- package/chronovisor-engine/src/models/drift_monitor.py +51 -0
- package/chronovisor-engine/src/models/intent_classifier.py +189 -0
- package/chronovisor-engine/src/models/lstm_predictor.py +143 -0
- package/chronovisor-engine/src/models/microstructure_specialist.py +65 -0
- package/chronovisor-engine/src/models/narrative_detector.py +418 -0
- package/chronovisor-engine/src/models/portfolio_optimizer.py +162 -0
- package/chronovisor-engine/src/models/project_risk_scorer.py +184 -0
- package/chronovisor-engine/src/models/pump_detector.py +344 -0
- package/chronovisor-engine/src/models/regime_detector.py +127 -0
- package/chronovisor-engine/src/models/rug_detector.py +197 -0
- package/chronovisor-engine/src/models/sentiment_analyzer.py +257 -0
- package/chronovisor-engine/src/models/signal_classifier.py +191 -0
- package/chronovisor-engine/src/models/stacking_meta.py +56 -0
- package/chronovisor-engine/src/models/strategy_bandit.py +191 -0
- package/chronovisor-engine/src/models/ta_interpreter.py +341 -0
- package/chronovisor-engine/src/models/target_quantile.py +96 -0
- package/chronovisor-engine/src/models/trend_scorer.py +107 -0
- package/chronovisor-engine/src/models/wallet_classifier.py +261 -0
- package/chronovisor-engine/src/server.py +1686 -0
- package/chronovisor-engine/src/training/__init__.py +0 -0
- package/chronovisor-engine/src/training/data_loader.py +635 -0
- package/chronovisor-engine/src/training/pipeline.py +130 -0
- package/chronovisor-engine/src/training/train_catalyst.py +169 -0
- package/chronovisor-engine/src/training/train_classifier.py +159 -0
- package/chronovisor-engine/src/training/train_conformal.py +106 -0
- package/chronovisor-engine/src/training/train_direction.py +215 -0
- package/chronovisor-engine/src/training/train_drift.py +57 -0
- package/chronovisor-engine/src/training/train_isotonic.py +58 -0
- package/chronovisor-engine/src/training/train_lstm.py +217 -0
- package/chronovisor-engine/src/training/train_microstructure.py +102 -0
- package/chronovisor-engine/src/training/train_narrative.py +168 -0
- package/chronovisor-engine/src/training/train_pump.py +109 -0
- package/chronovisor-engine/src/training/train_regime.py +116 -0
- package/chronovisor-engine/src/training/train_rug.py +58 -0
- package/chronovisor-engine/src/training/train_sentiment.py +63 -0
- package/chronovisor-engine/src/training/train_stacking_meta.py +74 -0
- package/chronovisor-engine/src/training/train_target_quantile.py +115 -0
- package/chronovisor-engine/src/training/train_trend.py +101 -0
- package/dist/index.js +23803 -14468
- package/dist/index.js.map +1 -1
- package/package.json +6 -4
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""LSTM time-series predictor for price direction.
|
|
2
|
+
|
|
3
|
+
Input: 100-candle windows with 8 per-candle features (close, rsi, macdHist,
|
|
4
|
+
bbPercentB, atr, obv, funding, fearGreed) → shape (1, 100, 8)
|
|
5
|
+
Output: price direction probability over 1h/4h/1d horizons
|
|
6
|
+
|
|
7
|
+
Trained on historical klines from PostgreSQL via training/train_lstm.py.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
|
|
15
|
+
# Directory that holds serialized model artifacts; the MODEL_DIR environment
# variable overrides the default relative "models" folder.
MODEL_DIR = Path(os.environ.get("MODEL_DIR", "models"))

# Column layout of a single LSTM timestep: one value per entry, in this order.
# The module docstring states that training/train_lstm.py builds its sequences
# with the same columns.
PER_CANDLE_FEATURES = [
    "close",
    "rsi",
    "macdHist",
    "bbPercentB",
    "atr",
    "obv",
    "funding",
    "fearGreed",
]

# Number of candles (timesteps) in one model input window.
SEQUENCE_LENGTH = 100

# Width of one timestep, derived from the column list so the two cannot drift.
NUM_FEATURES = len(PER_CANDLE_FEATURES)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class LSTMPredictor:
    """Price-direction predictor backed by a trained LSTM with a rule-based fallback.

    ``load()`` looks for ``lstm_predictor.pt`` (plus an optional fitted scaler)
    under ``MODEL_DIR``. When no usable model is available the instance runs in
    heuristic mode and ``predict()`` answers from RSI / MACD rules instead.
    """

    def __init__(self):
        self.version = "0.2.0"
        self.is_loaded = False
        self.last_trained: str | None = None
        self.accuracy: float | None = None
        self.model = None
        self._scaler = None

    def load(self):
        """Load the trained LSTM model from disk, or fall back to the heuristic.

        The model is only adopted when the loaded object is callable. Any
        failure (missing torch, unreadable file, non-callable checkpoint) is
        logged and leaves the predictor in heuristic mode instead of raising.
        """
        import logging

        log = logging.getLogger(__name__)
        model_path = MODEL_DIR / "lstm_predictor.pt"
        scaler_path = MODEL_DIR / "lstm_predictor_scaler.joblib"

        if not model_path.exists():
            self._init_heuristic()
            return

        try:
            import torch

            model = torch.load(model_path, weights_only=True)
            # A checkpoint saved as a state_dict comes back as a plain mapping.
            # It cannot be called for inference, so reject it here rather than
            # crash on every later predict() call.
            if not callable(model):
                raise TypeError(
                    f"{model_path} contains a {type(model).__name__}, not a callable model"
                )
            if hasattr(model, "eval"):
                # Inference only: switch off training-time behaviour such as dropout.
                model.eval()
            last_trained = str(model_path.stat().st_mtime)
        except Exception:
            log.warning(
                "Could not load LSTM model from %s; using heuristic fallback",
                model_path,
                exc_info=True,
            )
            self.model = None
            self._scaler = None
            self._init_heuristic()
            return

        self.model = model
        self.is_loaded = True
        self.last_trained = last_trained

        # The scaler is optional: without it the raw feature values are fed in.
        if scaler_path.exists():
            try:
                import joblib

                self._scaler = joblib.load(scaler_path)
            except Exception:
                log.warning(
                    "Could not load LSTM scaler from %s; continuing unscaled",
                    scaler_path,
                    exc_info=True,
                )
                self._scaler = None

    def _init_heuristic(self):
        """Fallback: use a simple heuristic until a trained model is available."""
        self.is_loaded = True
        self.version = "0.2.0-heuristic"

    def predict(self, ohlcv_window: list[dict], indicators: dict) -> dict:
        """Predict price direction from OHLCV candles plus indicators.

        Args:
            ohlcv_window: Candle dicts, oldest first; only the last
                ``SEQUENCE_LENGTH`` entries are used by the model path.
            indicators: Latest indicator values, used as per-candle defaults
                and by the heuristic.

        Returns:
            dict with keys: direction, probability, model
        """
        if self.model is not None:
            return self._predict_model(ohlcv_window, indicators)
        return self._predict_heuristic(indicators)

    def _predict_model(self, ohlcv_window: list[dict], indicators: dict) -> dict:
        """Run inference through the trained LSTM model.

        Builds a (1, T, F) tensor with T=SEQUENCE_LENGTH timesteps and F=8
        per-candle features. A candle missing a feature takes the value from
        ``indicators`` (or a neutral default). An empty window cannot form a
        sequence, so it is answered by the heuristic instead.
        """
        if not ohlcv_window:
            # No candle to build (or pad) a sequence from.
            return self._predict_heuristic(indicators)

        import torch

        candles = ohlcv_window[-SEQUENCE_LENGTH:]

        # Per-candle feature matrix, shape (T, F); column order follows
        # PER_CANDLE_FEATURES.
        feature_matrix = [
            [
                float(candle.get("close", 0)),
                float(candle.get("rsi", indicators.get("rsi", 50))),
                float(candle.get("macdHist", indicators.get("macdHistogram", 0))),
                float(candle.get("bbPercentB", indicators.get("bollingerPercentB", 0.5))),
                float(candle.get("atr", indicators.get("atr", 0))),
                float(candle.get("obv", indicators.get("obv", 0))),
                float(candle.get("funding", indicators.get("fundingRate", 0))),
                float(candle.get("fearGreed", indicators.get("fearGreed", 50))),
            ]
            for candle in candles
        ]

        # Left-pad short windows by repeating the oldest available candle.
        missing = SEQUENCE_LENGTH - len(feature_matrix)
        if missing > 0:
            feature_matrix = [feature_matrix[0]] * missing + feature_matrix

        arr = np.array(feature_matrix, dtype=np.float32)  # (100, 8)

        # Apply fitted scaler if available (normalize per-feature)
        if self._scaler is not None:
            arr = self._scaler.transform(arr)

        # Shape: (1, T=100, F=8)
        tensor = torch.tensor(arr, dtype=torch.float32).unsqueeze(0)

        with torch.no_grad():
            output = self.model(tensor)
            probs = torch.softmax(output, dim=-1).squeeze().numpy()

        # NOTE(review): assumes the model's output classes are ordered
        # up / sideways / down; confirm against the label encoding in train_lstm.py.
        directions = ["up", "sideways", "down"]
        idx = int(np.argmax(probs))

        return {
            "direction": directions[idx],
            "probability": float(probs[idx]),
            "model": f"lstm-predictor-{self.version}",
        }

    def _predict_heuristic(self, indicators: dict) -> dict:
        """Simple heuristic based on RSI + MACD until a model is trained.

        Oversold RSI (<30) adds 0.3 and overbought RSI (>70) subtracts 0.3.
        The sign of the MACD histogram adds or subtracts 0.2. Scores within
        +/-0.1 are reported as "sideways".
        """
        rsi = indicators.get("rsi", 50)
        macd_hist = indicators.get("macdHistogram", 0)

        score = 0.0
        if rsi < 30:
            score += 0.3
        elif rsi > 70:
            score -= 0.3
        if macd_hist > 0:
            score += 0.2
        elif macd_hist < 0:
            score -= 0.2

        if score > 0.1:
            return {"direction": "up", "probability": 0.5 + score, "model": "lstm-heuristic"}
        if score < -0.1:
            return {"direction": "down", "probability": 0.5 + abs(score), "model": "lstm-heuristic"}
        return {"direction": "sideways", "probability": 0.5, "model": "lstm-heuristic"}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Short-horizon microstructure specialist for 1m-15m direction calls."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import joblib
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
# Directory that holds serialized model artifacts; the MODEL_DIR environment
# variable overrides the default relative "models" folder.
MODEL_DIR = Path(os.environ.get("MODEL_DIR", "models"))
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class MicrostructureSpecialist:
    """Short-horizon direction classifier with a built-in heuristic fallback.

    ``load()`` tries to read a joblib bundle from ``MODEL_DIR``. If that fails
    the instance still reports ``is_loaded`` and ``predict()`` answers from a
    weighted return / volume score instead of a trained model.
    """

    # Order of the values placed in the single-row feature vector.
    FEATURE_KEYS = [
        "return_1",
        "volume_ratio",
        "range_pct",
        "wick_imbalance",
        "trade_intensity",
        "price_vs_sma20",
        "volatility_5",
    ]

    def __init__(self) -> None:
        self.model = None
        self.accuracy: float | None = None
        self.last_trained: str | None = None
        self.is_loaded = False
        self.version = "0.1.0"

    def load(self) -> None:
        """Read the trained bundle; on any error keep running without a model."""
        bundle_path = MODEL_DIR / "microstructure_specialist.joblib"
        try:
            bundle = joblib.load(bundle_path)
            self.model = bundle["model"]
            self.last_trained = bundle.get("trained_at")
            self.accuracy = bundle.get("accuracy")
        except Exception:
            # Best effort: a missing or broken artifact selects the heuristic path.
            self.model = None
        self.is_loaded = True

    def predict(self, features: dict) -> dict:
        """Return direction, probability, confidence (percent) and model name.

        Uses the trained model when one is loaded. Otherwise it scores 80% of
        ``return_1`` plus 20% of the volume ratio's excess over 1.0.
        """
        if self.model is not None:
            row = [features.get(key, 0.0) for key in self.FEATURE_KEYS]
            x = np.array([row], dtype=np.float32)
            class_probs = self.model.predict_proba(x)[0]
            best = int(np.argmax(class_probs))
            labels = {0: "down", 1: "sideways", 2: "up"}
            return {
                "direction": labels.get(best, "sideways"),
                "probability": round(float(class_probs[best]), 4),
                "confidence": round(float(class_probs[best]) * 100, 2),
                "model": "microstructure_specialist",
            }

        score = float(features.get("return_1", 0.0)) * 0.8 + (
            float(features.get("volume_ratio", 1.0)) - 1.0
        ) * 0.2
        if score > 0.15:
            direction = "up"
        elif score < -0.15:
            direction = "down"
        else:
            direction = "sideways"
        # Heuristic certainty is capped at 0.7.
        probability = min(0.7, 0.5 + abs(score) / 5)
        return {
            "direction": direction,
            "probability": round(probability, 4),
            "confidence": round(probability * 100, 2),
            "model": "heuristic-microstructure",
        }
|
|
@@ -0,0 +1,418 @@
|
|
|
1
|
+
"""Narrative detection model using TF-IDF + topic clustering.
|
|
2
|
+
|
|
3
|
+
Identifies trending crypto narratives from text corpora by matching against
|
|
4
|
+
known narrative keyword clusters and scoring by frequency and context.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import math
|
|
8
|
+
import os
|
|
9
|
+
import re
|
|
10
|
+
from collections import Counter
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Optional
|
|
14
|
+
|
|
15
|
+
# Directory that holds serialized model artifacts; the MODEL_DIR environment
# variable overrides the default relative "models" folder.
MODEL_DIR = Path(os.environ.get("MODEL_DIR", "models"))

# Narrative labels the detector knows about, in the same order as the keys of
# NARRATIVE_KEYWORDS below.
KNOWN_NARRATIVES = [
    "ai_crypto", "rwa", "depin", "meme", "l2_scaling",
    "restaking", "defi_revival", "gaming", "regulation", "btc_ecosystem",
]

# Lower-case keyword / phrase clusters, one list per narrative label.
NARRATIVE_KEYWORDS: dict[str, list[str]] = {
    "ai_crypto": [
        "artificial intelligence", "machine learning", "ai agent", "neural",
        "llm", "gpt", "generative", "ai token", "ai crypto", "compute",
        "inference", "training data", "decentralized ai", "ai blockchain",
        "openai", "anthropic",
    ],
    "rwa": [
        "real world asset", "rwa", "tokenized", "treasury", "t-bill", "bond",
        "real estate", "commodity", "securitization", "ondo", "centrifuge",
        "maple", "clearpool", "institutional",
    ],
    "depin": [
        "depin", "decentralized physical", "iot", "sensor", "wireless",
        "helium", "hivemapper", "render", "filecoin", "storage",
        "compute network", "physical infrastructure", "hardware",
    ],
    "meme": [
        "meme", "memecoin", "doge", "shib", "pepe", "bonk", "wif",
        "community token", "fair launch", "pump fun", "solana meme", "based",
        "moon", "ape", "degen",
    ],
    "l2_scaling": [
        "layer 2", "l2", "rollup", "optimistic", "zk rollup", "zero knowledge",
        "arbitrum", "optimism", "base", "zksync", "starknet", "scroll",
        "polygon", "scaling", "throughput", "tps",
    ],
    "restaking": [
        "restaking", "eigenlayer", "liquid restaking", "lrt", "avs",
        "actively validated", "ether.fi", "puffer", "renzo", "kelp",
        "shared security", "slashing",
    ],
    "defi_revival": [
        "defi", "decentralized finance", "yield", "lending", "borrowing",
        "dex", "amm", "liquidity", "tvl", "aave", "uniswap", "curve", "maker",
        "compound", "perp", "perpetual",
    ],
    "gaming": [
        "gamefi", "gaming", "play to earn", "p2e", "nft game", "metaverse",
        "virtual world", "axie", "immutable", "gala", "illuvium", "guild",
        "esports", "blockchain game",
    ],
    "regulation": [
        "regulation", "sec", "cftc", "compliance", "etf", "spot etf",
        "bitcoin etf", "legislation", "framework", "license", "ban", "legal",
        "enforcement", "stablecoin bill", "mica",
    ],
    "btc_ecosystem": [
        "bitcoin", "btc", "ordinals", "inscription", "brc-20", "rune", "runes",
        "lightning", "taproot", "nostr", "stacks", "bitcoin l2", "halving",
        "satoshi", "bitcoin defi",
    ],
}
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
@dataclass
class NarrativeResult:
    """One detected narrative together with the evidence that supports it."""

    # Label of the detected narrative.
    narrative: str
    # Score in the 0-1 range.
    confidence: float
    # Token symbols associated with this narrative.
    related_tokens: list[str]
    # Top keywords found for this narrative.
    keywords: list[str]
    # One of 'emerging', 'peaking', 'fading'.
    trend_direction: str
    # Number of keyword mentions counted for this narrative.
    mention_count: int
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
class NarrativeDetectorModel:
    """Narrative detection using TF-IDF keyword matching against known clusters.

    Each known narrative owns a list of lower-case keywords. ``detect()``
    scores every narrative by the TF-IDF weight of its keywords in the input
    corpus and reports the tickers and trend seen alongside those keywords.
    """

    # "$symbol" in any case, or a bare run of 2-5 capital letters.
    _TOKEN_PATTERN = re.compile(r"\$([A-Za-z]{2,10})\b|(?<!\w)([A-Z]{2,5})(?!\w)")

    # Common English words that pass the bare-capitals rule when a text shouts.
    _TICKER_STOPWORDS = frozenset(
        {
            "THE", "AND", "FOR", "WITH", "FROM", "THIS", "THAT", "HAS", "ARE",
            "WAS", "NOT", "BUT", "ALL", "CAN", "HAD", "HER", "ONE", "OUR",
            "OUT", "NEW",
        }
    )

    def __init__(self) -> None:
        self.version = "0.1.0"
        self.is_loaded = False
        self.last_trained: str | None = None
        self.accuracy: float | None = None
        self.narratives = KNOWN_NARRATIVES
        self.keywords = NARRATIVE_KEYWORDS

    def load(self) -> None:
        """Initialize model (keyword-based, always ready)."""
        self.is_loaded = True

    def detect(self, texts: list[str]) -> "list[NarrativeResult]":
        """Detect narratives from a corpus of texts.

        Keywords are matched as substrings of the lower-cased documents. Per
        keyword the weight is ``(1 + ln(tf)) * (ln(N / df) + 1)``, where ``tf``
        is the total occurrence count and ``df`` the number of documents
        containing it. A narrative's score is the sum over its matched
        keywords, normalized by the best-scoring narrative.

        Returns:
            Results sorted by confidence descending; empty when ``texts`` is
            empty or no keyword matches.
        """
        if not texts:
            return []

        corpus_lower = [t.lower() for t in texts]
        total_docs = len(corpus_lower)

        # narrative -> (score, [(keyword, occurrence count), ...])
        scored: dict[str, tuple[float, list[tuple[str, int]]]] = {}
        for narrative, kw_list in self.keywords.items():
            matched: list[tuple[str, int]] = []
            total_tf_idf = 0.0
            for keyword in kw_list:
                df = sum(1 for doc in corpus_lower if keyword in doc)
                if df == 0:
                    continue
                # Smoothed IDF: a keyword present in every document still counts.
                idf = math.log(total_docs / df) + 1.0
                # df > 0 guarantees tf >= 1, so the logarithm is defined.
                tf = sum(doc.count(keyword) for doc in corpus_lower)
                total_tf_idf += (1 + math.log(tf)) * idf
                matched.append((keyword, tf))
            if matched:
                scored[narrative] = (total_tf_idf, matched)

        if not scored:
            return []

        # Normalize scores to a 0-1 confidence relative to the top narrative.
        max_score = max(score for score, _ in scored.values())
        if max_score == 0:
            max_score = 1.0

        results: list[NarrativeResult] = []
        for narrative, (score, matched) in scored.items():
            matched_keywords = [kw for kw, _ in matched]
            # Stable sort: equally frequent keywords keep their cluster order.
            by_count = sorted(matched, key=lambda item: item[1], reverse=True)

            results.append(
                NarrativeResult(
                    narrative=narrative,
                    confidence=round(min(1.0, score / max_score), 4),
                    # Tickers are read from the original-case texts: case is
                    # the only thing separating "ETH" from an ordinary word.
                    related_tokens=self._extract_tokens(texts, narrative)[:10],
                    keywords=[kw for kw, _ in by_count[:5]],
                    trend_direction=self._estimate_trend(corpus_lower, matched_keywords),
                    mention_count=sum(count for _, count in matched),
                )
            )

        results.sort(key=lambda r: r.confidence, reverse=True)
        return results

    def predict(self, features: dict) -> "NarrativeResult":
        """API-compatible prediction (takes {"texts": [...]}).

        Returns the top narrative result, or an "unknown" placeholder with
        zero confidence when nothing is detected.
        """
        texts = features.get("texts", [])
        results = self.detect(texts)
        if not results:
            return NarrativeResult(
                narrative="unknown",
                confidence=0.0,
                related_tokens=[],
                keywords=[],
                trend_direction="fading",
                mention_count=0,
            )
        return results[0]

    def get_trending_narratives(
        self, texts: list[str], top_k: int = 5
    ) -> "list[NarrativeResult]":
        """Get the top-k trending narratives from the text corpus."""
        return self.detect(texts)[:top_k]

    def _extract_tokens(self, docs: list[str], narrative: str) -> list[str]:
        """Extract ticker symbols mentioned alongside a narrative's keywords.

        Args:
            docs: Documents in their ORIGINAL letter case. Keyword matching is
                done on a lower-cased copy, ticker matching on the text as
                written.
            narrative: Key into ``self.keywords``.

        Returns:
            Up to 10 upper-cased symbols, most frequent first.
        """
        narrative_kws = self.keywords.get(narrative, [])
        token_counts: Counter[str] = Counter()

        for doc in docs:
            lowered = doc.lower()
            # Only count tokens in docs that contain narrative keywords.
            if not any(kw in lowered for kw in narrative_kws):
                continue
            for dollar_symbol, bare_symbol in self._TOKEN_PATTERN.findall(doc):
                symbol = (dollar_symbol or bare_symbol).upper()
                if symbol not in self._TICKER_STOPWORDS:
                    token_counts[symbol] += 1

        return [symbol for symbol, _ in token_counts.most_common(10)]

    def _estimate_trend(self, docs: list[str], keywords: list[str]) -> str:
        """Estimate whether a narrative is emerging, peaking, or fading.

        Splits the corpus into thirds (chronological order assumed) and
        compares keyword mentions per document across the segments. Fewer
        than three documents are always reported as "emerging".
        """
        if len(docs) < 3:
            return "emerging"

        third = max(1, len(docs) // 3)
        # The last segment absorbs the remainder when len(docs) % 3 != 0.
        segments = (docs[:third], docs[third : 2 * third], docs[2 * third :])

        def density(segment: list[str]) -> float:
            mentions = sum(doc.count(kw) for doc in segment for kw in keywords)
            return mentions / max(1, len(segment))

        early_density, middle_density, late_density = (density(s) for s in segments)

        if late_density > middle_density * 1.2 and late_density > early_density:
            return "emerging"
        if middle_density >= early_density and middle_density >= late_density:
            return "peaking"
        # NOTE(review): a slow rise (late above middle by less than 20%) also
        # lands here and is labelled "fading"; confirm that is intended.
        return "fading"
|